# Create useful datasets for DO Slim -- a non-redundant set of diseases

In [1]:
import os

import pandas
import networkx

import do_tools

In [2]:
# Load the Disease Ontology from its OBO download, then convert it with
# do_tools.do_to_networkx for the graph queries in later cells.
obo_location = ('download', 'HumanDO.obo')
path = os.path.join(*obo_location)
do = do_tools.load_do(path)
dox = do_tools.do_to_networkx(do)


In [3]:
# Load the tab-separated table of DO slim terms and preview its first three rows.
path = os.path.join('data', 'slim-terms1.tsv')
slim_df = pandas.read_csv(path, sep='\t')
slim_df.head(3)

Unnamed: 0,doid,name,pathophysiology,source
0,DOID:2531,hematologic cancer,neoplastic,DOcancerslim
1,DOID:1319,brain cancer,neoplastic,DOcancerslim
2,DOID:1324,lung cancer,neoplastic,DOcancerslim


In [4]:
# Check for terms that are in the slim set but not in the DO.
all_doids = set(do.get_term_ids())
slim_doids = set(slim_df.doid)
# Record the missing ids first: the next statement narrows slim_doids in place.
unmatched = slim_doids.difference(all_doids)
slim_doids.intersection_update(all_doids)
# Display the slim rows whose id is absent from the DO.
slim_df[slim_df.doid.isin(unmatched)]

Unnamed: 0,doid,name,pathophysiology,source
55,DOID:9917,pleural cancer,neoplastic,DOcancerslim


In [9]:
# Check for node redundancy: a slim term conflicts when another slim term is
# among the nodes reachable from it in dox.
slim_terms = {do.get_term(doid) for doid in slim_doids}
conflicting_terms = dict()
for term in slim_terms:
    reachable = networkx.descendants(dox, term)
    overlap_names = {node.name for node in reachable.intersection(slim_terms)}
    if not overlap_names:
        continue
    print(term.name, overlap_names)
    # Keyed by the id of the term that has slim terms reachable from it;
    # the value holds the names of those reachable slim terms.
    conflicting_terms[term.id] = overlap_names

Hodgkin's lymphoma {'hematologic cancer'}
fibrolamellar carcinoma {'liver cancer'}
hemolytic-uremic syndrome {'anemia'}
autosomal dominant nocturnal frontal lobe epilepsy {'epilepsy'}
nasopharynx carcinoma {'pharynx cancer'}
pre-eclampsia {'hypertension'}
extragonadal nonseminomatous germ cell tumor {'germ cell cancer'}
uveal melanoma {'ocular cancer'}
neuroblastoma {'peripheral nervous system neoplasm'}
benign neonatal seizures {'epilepsy'}
chordoid meningioma {'meningioma'}
chronic lymphocytic leukemia {'hematologic cancer'}
Fanconi anemia {'anemia'}
acute lymphocytic leukemia {'hematologic cancer'}
amyotrophic lateral sclerosis type 22 {'amyotrophic lateral sclerosis'}
chordoma {'bone cancer'}
essential hypertension {'hypertension'}
cutaneous T cell lymphoma {'hematologic cancer'}
age related macular degeneration {'macular degeneration'}
essential thrombocythemia {'hematologic cancer'}
embryonal rhabdomyosarcoma {'muscle cancer'}
paroxysmal nocturnal hemoglobinuria {'anemia'}
photos

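When a slim term has another slim term among its more general terms, drop the more specific term and keep only the more general one.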

In [11]:
# Remove from slim_df every term recorded as a key of conflicting_terms,
# then renumber the remaining rows from zero.
bad_terms = list(conflicting_terms)
is_redundant = slim_df.doid.isin(bad_terms)
slim_df = slim_df[~is_redundant].reset_index(drop=True)

In [12]:
# Load the two cross-reference tables (tab-separated): xrefs.tsv and xrefs-prop.tsv.
xref_paths = [os.path.join('data', filename) for filename in ('xrefs.tsv', 'xrefs-prop.tsv')]
map_unprop_df, map_prop_df = map(pandas.read_table, xref_paths)
# Leave `path` pointing at the last file read.
path = xref_paths[-1]

In [13]:
# Rename the id column to doid_code, then merge the slim ids with each xref table.
# merge() runs with its defaults (inner join on the shared column names).
slim_df = slim_df.rename(columns={'doid': 'doid_code'})
slim_codes_df = slim_df[['doid_code']]
slim_map_unprop_df = slim_codes_df.merge(map_unprop_df)
slim_map_prop_df = slim_codes_df.merge(map_prop_df)
slim_map_prop_df.head(3)

Unnamed: 0,doid_code,doid_name,resource,resource_id
0,DOID:2531,hematologic cancer,CSP,2004-1600
1,DOID:2531,hematologic cancer,CSP,2004-1803
2,DOID:2531,hematologic cancer,CSP,2004-2820


In [14]:
# Write both slim xref tables as tab-separated files without the index column.
slim_outputs = [
    ('xrefs-slim.tsv', slim_map_unprop_df),
    ('xrefs-prop-slim.tsv', slim_map_prop_df),
]
for filename, frame in slim_outputs:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)

In [15]:
# Build the slim -> subsumed term table and write it to data/slim-terms-prop.tsv.
# Each retained slim term is paired with itself (distance 0) and with every term
# that has a directed path to it in dox, with the shortest path length.
#
# Terms recorded in `conflicting_terms` were removed from slim_df as redundant,
# so they are skipped here too; otherwise the dropped terms would reappear as
# slim terms in this table.
retained_terms = [term for term in slim_terms if term.id not in conflicting_terms]

rows = list()
for term in retained_terms:
    row_part = [term.id, term.name]
    # With only `target` given, networkx returns the shortest path length from
    # every node that can reach the target (the target itself at 0), replacing
    # one shortest-path query per (subterm, term) pair. dict() keeps this
    # working whether a mapping or an iterable of pairs comes back.
    distance_to_term = dict(networkx.shortest_path_length(dox, target=term))
    for subterm, distance in distance_to_term.items():
        rows.append(row_part + [subterm.id, subterm.name, distance])
rows.sort()

slim_prop_df = pandas.DataFrame(rows, columns=['slim_id', 'slim_name', 'subsumed_id', 'subsumed_name', 'min_distance'])
path = os.path.join('data', 'slim-terms-prop.tsv')
slim_prop_df.to_csv(path, sep='\t', index=False)

In [18]:
# Report the line count of the slim terms input file via a shell escape.
! wc -l data/slim-terms1.tsv

605 data/slim-terms1.tsv
