In [1]:
import sys
sys.path.append('..')

In [42]:
import requests as r
import pandas as pd

from lib.classification.elastic import search_term, collapse_matches
from lib.obo import read_ontology
from elasticsearch import Elasticsearch
es = Elasticsearch()


In [2]:
# Download the L1000CDS2 disease listing and parse the JSON payload.
# An explicit timeout keeps a stalled server from hanging the kernel indefinitely,
# and raise_for_status() reports HTTP errors directly instead of letting an error
# page surface later as a confusing JSON decoding failure.
response = r.get('http://amp.pharm.mssm.edu/L1000CDS2/diseases', timeout=60)
response.raise_for_status()
diseases = response.json()

In [9]:
# Tabulate the downloaded records and preview the first ten rows.
data = pd.DataFrame(diseases)
data.iloc[:10]

Unnamed: 0,_id,desc,term
0,552849170bdc500e55660240,Adrenal gland,Cushing syndrome_gse4060
1,552849170bdc500e55660241,Hepatic Tissue,Hepatic lipidosis_gse5538
2,552849170bdc500e55660242,Muscle - Striated (Skeletal) (MMHCC),"Melas - mitochondrial myopathy, encephalopathy..."
3,552849170bdc500e55660243,Sciatic Nerve,Peripheral motor neuropathy_gse1947
4,552849170bdc500e55660244,CNS - Brain - Cerebellum (MMHCC),Down syndrome_gse1611
5,552849170bdc500e55660245,frontal cortex,Bipolar disorder_gse5388
6,552849170bdc500e55660246,Endothelial cell,Sickle cell anemia_gse9877
7,552849170bdc500e55660247,Synovial Membrane,Ra (rheumatoid arthritis)_gse1919
8,552849170bdc500e55660248,CNS - Brain - Hippocampus (MMHCC),Anxiety disorder_gse932
9,552849170bdc500e55660249,Muscle - Striated (Skeletal) (MMHCC),Senescence_gse1786


In [10]:
# Each `term` looks like '<disease>_<series id>' (e.g. 'Cushing syndrome_gse4060').
# Split on the LAST underscore only, so a disease name that itself contains an
# underscore is kept whole and the series id is always the trailing token.
# A term with no underscore at all still raises IndexError, as before.
data['series'] = data['term'].map(lambda t: t.rsplit('_', 1)[1])
# `desc` holds the tissue description; expose it under a clearer column name.
data['tissue'] = data['desc']
data['disease'] = data['term'].map(lambda t: t.rsplit('_', 1)[0])

In [11]:
# Keep only the derived columns. The explicit .copy() makes the result an
# independent frame, so adding columns to `data` afterwards does not trigger
# pandas' SettingWithCopyWarning about writing into a slice of another frame.
data = data[['series', 'tissue', 'disease']].copy()
data[:10]

Unnamed: 0,series,tissue,disease
0,gse4060,Adrenal gland,Cushing syndrome
1,gse5538,Hepatic Tissue,Hepatic lipidosis
2,gse1462,Muscle - Striated (Skeletal) (MMHCC),"Melas - mitochondrial myopathy, encephalopathy..."
3,gse1947,Sciatic Nerve,Peripheral motor neuropathy
4,gse1611,CNS - Brain - Cerebellum (MMHCC),Down syndrome
5,gse5388,frontal cortex,Bipolar disorder
6,gse9877,Endothelial cell,Sickle cell anemia
7,gse1919,Synovial Membrane,Ra (rheumatoid arthritis)
8,gse932,CNS - Brain - Hippocampus (MMHCC),Anxiety disorder
9,gse1786,Muscle - Striated (Skeletal) (MMHCC),Senescence


In [15]:
import qgrid
# Interactive grid showing how many records carry each raw tissue label.
tissue_counts = data['tissue'].value_counts()
qgrid.show_grid(pd.DataFrame({'tissue': tissue_counts}))

In [17]:
# Shape of the array of distinct raw tissue labels, i.e. (number_of_labels,).
data['tissue'].unique().shape

(432,)

In [21]:
# Rows whose raw tissue label contains both the 'MMHCC' and the 'CNS' marker.
has_both_markers = data['tissue'].map(
    lambda label: all(marker in label for marker in ('MMHCC', 'CNS'))
)
data[has_both_markers]

Unnamed: 0,series,tissue,disease
4,gse1611,CNS - Brain - Cerebellum (MMHCC),Down syndrome
8,gse932,CNS - Brain - Hippocampus (MMHCC),Anxiety disorder
11,gse2223,CNS - Brain (MMHCC),Oligodendroglioma
18,gse3075,CNS - Spinal Cord (MMHCC),"Spinal muscular atrophy, infantile"
44,gse3531,CNS - Brain - Hippocampus (MMHCC),Senescence
75,gse2866,CNS - Brain - Cerebellum (MMHCC),Gamma-hydroxybutyric acidaemia
78,gse1481,CNS - Spinal Cord (MMHCC),Anterior horn cell disease
79,gse2739,CNS - Brain (MMHCC),Hypertension
81,gse4130,CNS - Brain - Hypothalamus (MMHCC),Dehydration
82,gse18803,CNS - Spinal Cord (MMHCC),Neurological pain disorder


In [38]:
def _clean_tissue(label):
    """Normalise a raw tissue label into a plain search phrase.

    Lower-cases the label, drops the 'mmhcc', 'cns' and 'tissue' tokens, turns
    ' - ' separators into spaces and removes parentheses. Whitespace is then
    collapsed, so removing a token from the middle of a label no longer leaves
    a double space behind (e.g. 'breast tissue tumor' -> 'breast tumor').
    """
    stripped = (label.lower()
                .replace('mmhcc', '')
                .replace('cns', '')
                .replace(' - ', ' ')
                .replace('(', '')
                .replace(')', '')
                .replace('tissue', ''))
    # split()/join trims both ends and squeezes runs of whitespace to one space.
    return ' '.join(stripped.split())


data['tissue_cl'] = data['tissue'].map(_clean_tissue)

In [39]:
# Display `data` in qgrid's interactive grid for manual inspection.
qgrid.show_grid(data)

In [35]:
# IPython help lookup: shows the documentation of search_term.
# Exploration aid only; it assigns nothing, so no later cell depends on it.
?search_term

In [37]:
# Try the lookup once with a single known term against the tissue ontology index.
probe_query = dict(es=es, index='tissue_ontology', term='breast')
search_term(**probe_query)

['BTO:0000149',
 'BTO:0001428',
 'BTO:0001211',
 'BTO:0005016',
 'BTO:0000816',
 'BTO:0000150',
 'BTO:0000356',
 'BTO:0002843',
 'BTO:0002326',
 'BTO:0001912',
 'BTO:0000185',
 'BTO:0004633',
 'BTO:0002844',
 'BTO:0000186',
 'BTO:0002994',
 'BTO:0002311',
 'BTO:0004718',
 'BTO:0005437',
 'BTO:0000023']

In [43]:
# Load the BRENDA tissue ontology from its .obo file (relative path).
obo_path = '../data/geo-annotation/brenda-tissue-ontology.obo'
ontology = read_ontology(obo_path)

Read obo graph
Name: 
Type: DiGraph
Number of nodes: 5659
Number of edges: 6430
Average in degree:   1.1362
Average out degree:   1.1362


In [40]:
def _lookup_tissue(phrase):
    """Query the 'tissue_ontology' index for one cleaned tissue phrase."""
    return search_term(es=es, index='tissue_ontology', term=phrase)


# One lookup per row; each cell of the new column holds that row's search result.
data['ontology_match'] = data['tissue_cl'].map(_lookup_tissue)

In [48]:
# Build the collapsing function once from the ontology graph, then apply it to
# every row's match list, overwriting the raw matches in place.
collapse = collapse_matches(ontology.graph)
data['ontology_match'] = data['ontology_match'].map(collapse)

In [54]:
def name(item_id):
    """Return a display label of the form '<ontology name> (<item_id>)'."""
    label = ontology_name(item_id)
    return '{} ({})'.format(label, item_id)
def ontology_name(item_id):
    """Return the `.name` of the `ontology.meta` entry for ``item_id``.

    Ids that are not present in `ontology.meta` are returned unchanged.
    """
    is_known = item_id in ontology.meta
    return ontology.meta[item_id].name if is_known else item_id



In [55]:
def _labelled_matches(frame):
    """Turn each row's list of matched ids into a list of display labels."""
    return frame['ontology_match'].map(lambda ids: [name(match_id) for match_id in ids])


# Show the cleaned phrase next to its matches and their display labels.
data[['tissue_cl', 'ontology_match']].assign(names=_labelled_matches)

Unnamed: 0,tissue_cl,ontology_match,names
0,adrenal gland,"[BTO:0000592, BTO:0000049, BTO:0000045]","[adrenal gland cancer cell (BTO:0000592), adre..."
1,hepatic,"[BTO:0000608, BTO:0003513, BTO:0004270, BTO:00...","[hepatoma cell (BTO:0000608), hepatic stellate..."
2,muscle striated skeletal,[],[]
3,sciatic nerve,[BTO:0001221],[sciatic nerve (BTO:0001221)]
4,brain cerebellum,[],[]
5,frontal cortex,[BTO:0000484],[frontal lobe (BTO:0000484)]
6,endothelial cell,"[BTO:0003247, BTO:0004400, BTO:0004574, BTO:00...","[baec cell (BTO:0003247), bpaec cell (BTO:0004..."
7,synovial membrane,[BTO:0001823],[synovium (BTO:0001823)]
8,brain hippocampus,[],[]
9,muscle striated skeletal,[],[]


In [57]:
# Count the matches per row once, then report the number of distinct cleaned
# phrases with exactly one match and with no match at all (as two shape tuples).
match_counts = data['ontology_match'].map(len)
(
    data.loc[match_counts == 1, 'tissue_cl'].unique().shape,
    data.loc[match_counts == 0, 'tissue_cl'].unique().shape,
)

((49,), (293,))