In [1]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import argparse
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import simple_icd_10 as icd
from collections import OrderedDict
from gensim.models import Word2Vec
import time

In [2]:
def get_roman_nums(num):
    roman_char_dict = OrderedDict()
    roman_char_dict[1000] = "M"
    roman_char_dict[900] = "CM"
    roman_char_dict[500] = "D"
    roman_char_dict[400] = "CD"
    roman_char_dict[100] = "C"
    roman_char_dict[90] = "XC"
    roman_char_dict[50] = "L"
    roman_char_dict[40] = "XL"
    roman_char_dict[10] = "X"
    roman_char_dict[9] = "IX"
    roman_char_dict[5] = "V"
    roman_char_dict[4] = "IV"
    roman_char_dict[1] = "I"

    def get_roman_num(num):
        for r in roman_char_dict.keys():
            x, y = divmod(num, r)
            yield roman_char_dict[r] * x
            num -= (r*x)
            if num < 0:
                break
    return "".join([x for x in get_roman_num(num)])

In [3]:
icd_edges = []
leaf_nodes = []
def get_child(keyword):
    code = icd.get_children(keyword)
    if len(code)==0:
        leaf_nodes.append(keyword)
        return
    else:
        # len(code) > 0
        for c in code:
            icd_edges.append([keyword, c])
            get_child(c)

# There are 22 chapters in ICD10
for i in range(1, 23):
    chapter = get_roman_nums(i)
    icd_edges.append(['root', chapter])
    get_child(chapter)

In [4]:
icd_edges

[['root', 'I'],
 ['I', 'A00-A09'],
 ['A00-A09', 'A00'],
 ['A00', 'A00.0'],
 ['A00', 'A00.1'],
 ['A00', 'A00.9'],
 ['A00-A09', 'A01'],
 ['A01', 'A01.0'],
 ['A01', 'A01.1'],
 ['A01', 'A01.2'],
 ['A01', 'A01.3'],
 ['A01', 'A01.4'],
 ['A00-A09', 'A02'],
 ['A02', 'A02.0'],
 ['A02', 'A02.1'],
 ['A02', 'A02.2'],
 ['A02', 'A02.8'],
 ['A02', 'A02.9'],
 ['A00-A09', 'A03'],
 ['A03', 'A03.0'],
 ['A03', 'A03.1'],
 ['A03', 'A03.2'],
 ['A03', 'A03.3'],
 ['A03', 'A03.8'],
 ['A03', 'A03.9'],
 ['A00-A09', 'A04'],
 ['A04', 'A04.0'],
 ['A04', 'A04.1'],
 ['A04', 'A04.2'],
 ['A04', 'A04.3'],
 ['A04', 'A04.4'],
 ['A04', 'A04.5'],
 ['A04', 'A04.6'],
 ['A04', 'A04.7'],
 ['A04', 'A04.8'],
 ['A04', 'A04.9'],
 ['A00-A09', 'A05'],
 ['A05', 'A05.0'],
 ['A05', 'A05.1'],
 ['A05', 'A05.2'],
 ['A05', 'A05.3'],
 ['A05', 'A05.4'],
 ['A05', 'A05.8'],
 ['A05', 'A05.9'],
 ['A00-A09', 'A06'],
 ['A06', 'A06.0'],
 ['A06', 'A06.1'],
 ['A06', 'A06.2'],
 ['A06', 'A06.3'],
 ['A06', 'A06.4'],
 ['A06', 'A06.5'],
 ['A06', 'A06.6'],
 

In [5]:
icd_edgelist = np.array(icd_edges.copy())
unique_icds = list(set(icd_edgelist.flatten()))
unique_icds.sort()

In [6]:
unique_icds

['A00',
 'A00-A09',
 'A00.0',
 'A00.1',
 'A00.9',
 'A01',
 'A01.0',
 'A01.1',
 'A01.2',
 'A01.3',
 'A01.4',
 'A02',
 'A02.0',
 'A02.1',
 'A02.2',
 'A02.8',
 'A02.9',
 'A03',
 'A03.0',
 'A03.1',
 'A03.2',
 'A03.3',
 'A03.8',
 'A03.9',
 'A04',
 'A04.0',
 'A04.1',
 'A04.2',
 'A04.3',
 'A04.4',
 'A04.5',
 'A04.6',
 'A04.7',
 'A04.8',
 'A04.9',
 'A05',
 'A05.0',
 'A05.1',
 'A05.2',
 'A05.3',
 'A05.4',
 'A05.8',
 'A05.9',
 'A06',
 'A06.0',
 'A06.1',
 'A06.2',
 'A06.3',
 'A06.4',
 'A06.5',
 'A06.6',
 'A06.7',
 'A06.8',
 'A06.9',
 'A07',
 'A07.0',
 'A07.1',
 'A07.2',
 'A07.3',
 'A07.8',
 'A07.9',
 'A08',
 'A08.0',
 'A08.1',
 'A08.2',
 'A08.3',
 'A08.4',
 'A08.5',
 'A09',
 'A09.0',
 'A09.9',
 'A15',
 'A15-A19',
 'A15.0',
 'A15.1',
 'A15.2',
 'A15.3',
 'A15.4',
 'A15.5',
 'A15.6',
 'A15.7',
 'A15.8',
 'A15.9',
 'A16',
 'A16.0',
 'A16.1',
 'A16.2',
 'A16.3',
 'A16.4',
 'A16.5',
 'A16.7',
 'A16.8',
 'A16.9',
 'A17',
 'A17.0',
 'A17.1',
 'A17.8',
 'A17.9',
 'A18',
 'A18.0',
 'A18.1',
 'A18.2',
 'A1

In [7]:
icd_descs = []
for icdcode in unique_icds:
    if icdcode == 'root':
        icd_descs.append('root')
    else:
        icd_descs.append(icd.get_description(icdcode))

In [8]:
model = Word2Vec(min_count=1,
                    window=5,
                    size=64,
                    sample=6e-5, 
                    alpha=0.03, 
                    min_alpha=0.0007, 
                    negative=20,
                    workers=16)

In [9]:
for x in icd_descs:
    print(x)

Cholera
Intestinal infectious diseases
Cholera due to Vibrio cholerae 01, biovar cholerae
Cholera due to Vibrio cholerae 01, biovar eltor
Cholera, unspecified
Typhoid and paratyphoid fevers
Typhoid fever
Paratyphoid fever A
Paratyphoid fever B
Paratyphoid fever C
Paratyphoid fever, unspecified
Other salmonella infections
Salmonella enteritis
Salmonella sepsis
Localized salmonella infections
Other specified salmonella infections
Salmonella infection, unspecified
Shigellosis
Shigellosis due to Shigella dysenteriae
Shigellosis due to Shigella flexneri
Shigellosis due to Shigella boydii
Shigellosis due to Shigella sonnei
Other shigellosis
Shigellosis, unspecified
Other bacterial intestinal infections
Enteropathogenic Escherichia coli infection
Enterotoxigenic Escherichia coli infection
Enteroinvasive Escherichia coli infection
Enterohaemorrhagic Escherichia coli infection
Other intestinal Escherichia coli infections
Campylobacter enteritis
Enteritis due to Yersinia enterocolitica
Enterocol

Polycystic ovarian syndrome
Primary ovarian failure
Other ovarian dysfunction
Ovarian dysfunction, unspecified
Testicular dysfunction
Testicular hyperfunction
Testicular hypofunction
Other testicular dysfunction
Testicular dysfunction, unspecified
Disorders of puberty, not elsewhere classified
Delayed puberty
Precocious puberty
Other disorders of puberty
Disorder of puberty, unspecified
Polyglandular dysfunction
Autoimmune polyglandular failure
Polyglandular hyperfunction
Other polyglandular dysfunction
Polyglandular dysfunction, unspecified
Diseases of thymus
Persistent hyperplasia of thymus
Abscess of thymus
Other diseases of thymus
Disease of thymus, unspecified
Other endocrine disorders
Carcinoid syndrome
Other hypersecretion of intestinal hormones
Ectopic hormone secretion, not elsewhere classified
Short stature, not elsewhere classified
Constitutional tall stature
Androgen resistance syndrome
Other specified endocrine disorders
Endocrine disorder, unspecified
Disorders of endocri

Peripheral retinal degeneration
Hereditary retinal dystrophy
Retinal haemorrhage
Separation of retinal layers
Other specified retinal disorders
Retinal disorder, unspecified
Retinal disorders in diseases classified elsewhere
Diabetic retinopathy
Other retinal disorders in diseases classified elsewhere
Glaucoma
Glaucoma
Glaucoma suspect
Primary open-angle glaucoma
Primary angle-closure glaucoma
Glaucoma secondary to eye trauma
Glaucoma secondary to eye inflammation
Glaucoma secondary to other eye disorders
Glaucoma secondary to drugs
Other glaucoma
Glaucoma, unspecified
Glaucoma in diseases classified elsewhere
Glaucoma in endocrine, nutritional and metabolic diseases
Glaucoma in other diseases classified elsewhere
Disorders of vitreous body
Disorders of vitreous body and globe
Vitreous prolapse
Vitreous haemorrhage
Crystalline deposits in vitreous body
Other vitreous opacities
Other disorders of vitreous body
Disorder of vitreous body, unspecified
Disorders of globe
Purulent endophthal

Toxic liver disease with hepatic necrosis
Toxic liver disease with acute hepatitis
Toxic liver disease with chronic persistent hepatitis
Toxic liver disease with chronic lobular hepatitis
Toxic liver disease with chronic active hepatitis
Toxic liver disease with hepatitis, not elsewhere classified
Toxic liver disease with fibrosis and cirrhosis of liver
Toxic liver disease with other disorders of liver
Toxic liver disease, unspecified
Hepatic failure, not elsewhere classified
Acute and subacute hepatic failure
Chronic hepatic failure
Hepatic failure, unspecified
Chronic hepatitis, not elsewhere classified
Chronic persistent hepatitis, not elsewhere classified
Chronic lobular hepatitis, not elsewhere classified
Chronic active hepatitis, not elsewhere classified
Other chronic hepatitis, not elsewhere classified
Chronic hepatitis, unspecified
Fibrosis and cirrhosis of liver
Hepatic fibrosis
Hepatic sclerosis
Hepatic fibrosis with hepatic sclerosis
Primary biliary cirrhosis
Secondary bilia

Other urethral disorders in diseases classified elsewhere
Other disorders of urinary system
Urinary tract infection, site not specified
Persistent proteinuria, unspecified
Orthostatic proteinuria, unspecified
Stress incontinence
Other specified urinary incontinence
Other specified disorders of urinary system
Disorder of urinary system, unspecified
Hyperplasia of prostate
Diseases of male genital organs
Inflammatory diseases of prostate
Acute prostatitis
Chronic prostatitis
Abscess of prostate
Prostatocystitis
Other inflammatory diseases of prostate
Inflammatory disease of prostate, unspecified
Other disorders of prostate
Calculus of prostate
Congestion and haemorrhage of prostate
Atrophy of prostate
Dysplasia of prostate
Other specified disorders of prostate
Disorder of prostate, unspecified
Hydrocele and spermatocele
Encysted hydrocele
Infected hydrocele
Other hydrocele
Hydrocele, unspecified
Spermatocele
Torsion of testis
Orchitis and epididymitis
Orchitis, epididymitis and epididymo

Indeterminate sex and pseudohermaphroditism
Hermaphroditism, not elsewhere classified
Male pseudohermaphroditism, not elsewhere classified
Female pseudohermaphroditism, not elsewhere classified
Pseudohermaphroditism, unspecified
Indeterminate sex, unspecified
Renal agenesis and other reduction defects of kidney
Congenital malformations of the urinary system
Renal agenesis, unilateral
Renal agenesis, bilateral
Renal agenesis, unspecified
Renal hypoplasia, unilateral
Renal hypoplasia, bilateral
Renal hypoplasia, unspecified
Potter syndrome
Cystic kidney disease
Congenital single renal cyst
Polycystic kidney, autosomal recessive
Polycystic kidney, autosomal dominant
Polycystic kidney, unspecified
Renal dysplasia
Medullary cystic kidney
Other cystic kidney diseases
Cystic kidney disease, unspecified
Congenital obstructive defects of renal pelvis and congenital malformations of ureter
Congenital hydronephrosis
Atresia and stenosis of ureter
Congenital megaloureter
Other obstructive defects 

Crushing injuries involving thorax with abdomen, lower back and pelvis
Crushing injuries involving multiple regions of upper limb(s)
Crushing injuries involving multiple regions of lower limb(s)
Crushing injuries involving multiple regions of upper limb(s) with lower limb(s)
Crushing injuries of thorax with abdomen, lower back and pelvis with limb(s)
Crushing injuries involving other combinations of body regions
Multiple crushing injuries, unspecified
Traumatic amputations involving multiple body regions
Traumatic amputation of both hands
Traumatic amputation of one hand and other arm [any level, except hand]
Traumatic amputation of both arms [any level]
Traumatic amputation of both feet
Traumatic amputation of one foot and other leg [any level, except foot]
Traumatic amputation of both legs [any level]
Traumatic amputation of upper and lower limbs, any combination [any level]
Traumatic amputations involving other combinations of body regions
Multiple traumatic amputations, unspecified

Accident to watercraft causing other injury : merchant ship
Accident to watercraft causing other injury : passenger ship
Accident to watercraft causing other injury : fishing boat
Accident to watercraft causing other injury : other powered watercraft
Accident to watercraft causing other injury : sailboat
Accident to watercraft causing other injury : canoe or kayak
Accident to watercraft causing other injury : inflatable craft (nonpowered)
Accident to watercraft causing other injury : water-skis
Accident to watercraft causing other injury : other unpowered watercraft
Accident to watercraft causing other injury : unspecified watercraft
Water-transport-related drowning and submersion without accident to watercraft
Water-transport-related drowning and submersion without accident to watercraft : merchant ship
Water-transport-related drowning and submersion without accident to watercraft : passenger ship
Water-transport-related drowning and submersion without accident to watercraft : fishing

In [10]:
t = time.time()
icd_descs = [x.lower().split(' ') for x in icd_descs]
model.build_vocab(icd_descs, progress_per=10000)
print(f"Build Vocab: {time.time()-t} sec")

Build Vocab: 1.1145052909851074 sec


In [11]:
icd_descs[2]

['cholera', 'due', 'to', 'vibrio', 'cholerae', '01,', 'biovar', 'cholerae']

In [12]:
t = time.time()
model.train(icd_descs, total_examples=model.corpus_count, epochs=30, report_delay=1)
print(f"Train Model: {time.time()-t} sec")
# Freeze model, init_sims() will make model more memory-efficient
model.init_sims(replace=True)

Train Model: 1.5491533279418945 sec


In [13]:
my_dict = dict({})
for idx, key in enumerate(model.wv.vocab):
    my_dict[key] = model.wv[key]

In [14]:
my_dict

{'cholera': array([-0.14510572, -0.01136373, -0.04619236,  0.21998096, -0.03164563,
        -0.06615528,  0.29427335,  0.14838375, -0.01352347, -0.10557207,
         0.13775815,  0.1559592 ,  0.01292409, -0.11709019,  0.13206632,
        -0.18893608, -0.02976857, -0.12195018,  0.09941003, -0.07903584,
        -0.08298874,  0.05199808,  0.16961   , -0.01658048, -0.11966089,
        -0.01027271,  0.02352996, -0.04186079,  0.17819728, -0.2409217 ,
        -0.07211303,  0.03305819,  0.08555552,  0.05886919,  0.10905011,
         0.01003917,  0.10190633, -0.00442652, -0.2729029 , -0.38387373,
         0.03425343,  0.09592792, -0.00444298, -0.01019725,  0.13864464,
         0.21105485, -0.01662347,  0.00666979, -0.10689647,  0.20207238,
         0.03922918, -0.05967723, -0.06102244,  0.09738556, -0.19797787,
        -0.06741525, -0.13350724,  0.04703782,  0.1482224 ,  0.05535753,
        -0.02245373, -0.02359452, -0.00904435,  0.13851257], dtype=float32),
 'intestinal': array([-0.16249481, -

In [15]:
df_embs = pd.read_csv(os.path.join('../icd10-data', 'embs.csv'))
print(df_embs.shape)
df_embs.head()

(12543, 65)


Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63,icdcode
0,1.055689,0.914306,1.017973,0.931304,0.999537,1.174611,1.046084,1.170472,0.972855,1.127879,...,1.053478,1.003908,0.999114,0.871874,0.720828,0.860998,0.857496,1.099006,0.886342,0
1,0.435807,0.300903,0.366733,0.223093,0.310091,0.50922,0.409593,0.498193,0.226463,0.427268,...,0.359167,0.29928,0.28866,0.211421,0.022797,0.237327,0.218211,0.469156,0.235369,1
2,0.148984,0.006543,0.165539,0.061942,0.162289,0.276317,0.195497,0.305104,0.09475,0.187104,...,0.209026,0.14083,0.105382,0.010476,-0.085514,-0.011004,0.037833,0.181327,0.020905,2
3,0.149412,0.007692,0.165528,0.060516,0.162734,0.276665,0.194766,0.305583,0.094197,0.187887,...,0.208381,0.140821,0.105508,0.010038,-0.087252,-0.010729,0.036823,0.181767,0.022035,3
4,0.560638,0.438192,0.519286,0.409643,0.495455,0.631107,0.576427,0.665163,0.51384,0.656934,...,0.51747,0.545916,0.477654,0.402325,0.220775,0.330195,0.379647,0.596032,0.443233,4


In [16]:
import pickle
with open(os.path.join('../icd10-data', 'encdmapper.pickle'),'rb') as input_file:
    mapper = pickle.load(input_file)
mapper

{'A00': 0,
 'A00-A09': 1,
 'A00.0': 2,
 'A00.1': 3,
 'A00.9': 4,
 'A01': 5,
 'A01.0': 6,
 'A01.1': 7,
 'A01.2': 8,
 'A01.3': 9,
 'A01.4': 10,
 'A02': 11,
 'A02.0': 12,
 'A02.1': 13,
 'A02.2': 14,
 'A02.8': 15,
 'A02.9': 16,
 'A03': 17,
 'A03.0': 18,
 'A03.1': 19,
 'A03.2': 20,
 'A03.3': 21,
 'A03.8': 22,
 'A03.9': 23,
 'A04': 24,
 'A04.0': 25,
 'A04.1': 26,
 'A04.2': 27,
 'A04.3': 28,
 'A04.4': 29,
 'A04.5': 30,
 'A04.6': 31,
 'A04.7': 32,
 'A04.8': 33,
 'A04.9': 34,
 'A05': 35,
 'A05.0': 36,
 'A05.1': 37,
 'A05.2': 38,
 'A05.3': 39,
 'A05.4': 40,
 'A05.8': 41,
 'A05.9': 42,
 'A06': 43,
 'A06.0': 44,
 'A06.1': 45,
 'A06.2': 46,
 'A06.3': 47,
 'A06.4': 48,
 'A06.5': 49,
 'A06.6': 50,
 'A06.7': 51,
 'A06.8': 52,
 'A06.9': 53,
 'A07': 54,
 'A07.0': 55,
 'A07.1': 56,
 'A07.2': 57,
 'A07.3': 58,
 'A07.8': 59,
 'A07.9': 60,
 'A08': 61,
 'A08.0': 62,
 'A08.1': 63,
 'A08.2': 64,
 'A08.3': 65,
 'A08.4': 66,
 'A08.5': 67,
 'A09': 68,
 'A09.0': 69,
 'A09.9': 70,
 'A15': 71,
 'A15-A19': 72,
 'A15.

In [17]:
mapper['root']

12542

In [23]:
df_edges = pd.read_csv(os.path.join('../icd10-data', 'edgelist.csv'))
df_edges.head()

Unnamed: 0,parent,child
0,12542,3797
1,3797,1
2,1,0
3,0,2
4,0,3


In [24]:
df_edges['child'].max()

12541

In [19]:
mapper_inv = {v:k for k,v in mapper.items()}

In [20]:
df_edges['child'] = df_edges['child'].map(mapper_inv)
df_edges['parent'] = df_edges['parent'].map(mapper_inv)
df_edges.head()

Unnamed: 0,parent,child
0,root,I
1,I,A00-A09
2,A00-A09,A00
3,A00,A00.0
4,A00,A00.1


In [21]:
df_edges['child'].unique().shape

(12542,)

In [22]:
df_lab = pd.read_csv(os.path.join('../icd10-data', 'july_2021_icd10_labcodelist.csv'))

FileNotFoundError: [Errno 2] No such file or directory: '../icd10-data/july_2021_icd10_labcodelist.csv'

In [None]:
mapper

In [None]:
df_lab.head()

In [None]:
df_lab.loc[(df_lab['ICD-10'].str.contains('A030'))]

In [None]:
df_lab['RESOLUTION CODE'].value_counts()

In [None]:
df_lab.loc[(df_lab['RESOLUTION CODE']=='2')]

In [None]:
def get_prefix(X, Y):
    idx = 0
    for x, y in zip(X, Y):
        if x==y:
            idx += 1
        else:
            break
    return idx

def split_icdcodes(icdcodes):
    if '-' in icdcodes:
        [first_icd, last_icd] = icdcodes.split('-')
        pre_idx = get_prefix(first_icd, last_icd)

        prefix00 = first_icd[:pre_idx]
        prefix01 = first_icd[:pre_idx]
        assert prefix00==prefix01

        first_suffix = int(first_icd[pre_idx:])
        last_suffix = int(last_icd[pre_idx:])

        split_icdcodes = [f"{prefix00}{i}" for i in range(first_suffix, last_suffix+1)]
        return split_icdcodes
    else:
        return [icdcodes]

results = split_icdcodes('A4150-A4153')
results

In [None]:
df_x = df_lab.copy()
df_x['ICD-10_v2'] = df_x['ICD-10'].str.replace('*', '')
df_x['ICD-10_v2'] = df_x['ICD-10_v2'].apply(split_icdcodes)

In [None]:
df_x = df_x[['ICD-10_v2', 'RESOLUTION CODE']]
df_x = df_x.explode('ICD-10_v2')
df_x

In [None]:
df_x.loc[(df_x['ICD-10_v2'].isin(mapper2.keys())), 'RESOLUTION CODE'].value_counts()

In [None]:
12542 - 5360 - 2800 - 576

In [None]:
mapper2 = {k.replace('.', ''):v for k,v in mapper.items()}

In [None]:
mapper2

In [None]:
df_lab.loc[(df_lab['ICD-10'].str.contains('-')), 'ICD-10'].tolist()

In [None]:
with open(,'rb') as input_file:
    mapper = pickle.load(input_file)
arr = [[k, v] for k, v in mapper.items()]
arr

In [34]:
df_lbl = pd.read_csv(os.path.join('../icd10-data', 'lbls.csv'))
print(df_lbl.shape)

(12651, 2)


In [35]:
df_lbl.sort_values(by=['ICD-10'],inplace=True)
df_lbl.reset_index(drop=True, inplace=True)
df_lbl

Unnamed: 0,ICD-10,RESOLUTION CODE
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
12646,12538,-1
12647,12539,-1
12648,12540,-1
12649,12541,-1


In [38]:
df_lbl.loc[(df_lbl['ICD-10']==5568)]

Unnamed: 0,ICD-10,RESOLUTION CODE
5656,5568,3
5657,5568,1


In [39]:
mapper_inv[5568]

'M13.0'

In [37]:
df_lbl['ICD-10'].value_counts()

5568    2
5984    2
5399    2
2172    2
6464    2
       ..
4719    1
6766    1
621     1
2668    1
0       1
Name: ICD-10, Length: 12543, dtype: int64

In [None]:
df_lbl.sort_values(by=['ICD-10'])