In [13]:
import sys
sys.path.append('..')

In [14]:
import lib
from lib import utils

import pandas as pd
import numpy as np

from lib.obo import read_ontology

In [15]:
import pymongo
# Handle to the `scraper_meta` database on a client created with default arguments.
db = pymongo.MongoClient()['scraper_meta']

In [16]:
# Select Homo sapiens samples that have a `tissue` characteristic and fetch
# only the fields used below (`_id` is excluded).
sample_query = {
    'characteristics.tissue': {'$exists': 1},
    'organism': 'Homo sapiens',
}
sample_fields = {
    '_id': 0,
    'accession': 1,
    'characteristics': 1,
    'series': 1,
}

# Lift the tissue label into its own column; a falsy label becomes NaN so the
# final dropna() discards that row (dropna also removes rows missing any other
# column).
data = (
    pd.DataFrame(list(db.samples.find(sample_query, sample_fields)))
    .assign(
        tissue=lambda frame: frame['characteristics'].map(
            lambda traits: traits['tissue'] or np.nan
        )
    )
    .dropna()
)

In [17]:
# (rows, columns) of the filtered sample table.
data.shape

(243692, 4)

In [18]:
# Ten most frequent tissue labels with their sample counts.
data['tissue'].value_counts().head(10)

whole blood            20499
blood                  20239
peripheral blood       15348
liver                   7328
lung                    4391
breast                  3932
bone marrow             3776
breast tumor            3002
brain                   2730
breast cancer tumor     2636
dtype: int64

In [19]:
# Load the BRENDA tissue ontology from the .obo file.
# NOTE(review): `subgraph='BTO:0001489'` presumably restricts the graph to the
# part reachable from that term, and `exclude_duplicates` presumably drops
# repeated entries — confirm both in lib.obo.read_ontology.
ontology = read_ontology(
    '../data/geo-annotation/brenda-tissue-ontology.obo',
    exclude_duplicates=True,
    subgraph='BTO:0001489',
)

Read obo graph
Name: 
Type: DiGraph
Number of nodes: 5659
Number of edges: 6430
Average in degree:   1.1362
Average out degree:   1.1362


In [20]:
# Tissue labels that occur on more than 100 samples; rarer labels are ignored.
t = data.tissue.value_counts()
tissues = t[t > 100].index.tolist()

# Map each frequent tissue label to the id of an ontology item that has a name
# equal to the label once the word 'tissue' is removed from the name and the
# surrounding whitespace is stripped.
#
# Each ontology name is normalised once and looked up in a set, instead of
# being re-normalised and compared against every tissue label in turn.
# If several items match the same label, the last one iterated wins, exactly
# as with the previous dict(generator) construction.
#
# NOTE(review): 'tissue' is removed from the ontology name only, so a sample
# label that itself contains 'tissue' (e.g. 'adipose tissue') can never match.
# Confirm that this is intended.
frequent_tissues = set(tissues)
matched = {
    label: item.id
    for item in ontology.items()
    for label in (name.replace('tissue', '').strip() for name in item.names())
    if label in frequent_tissues
}

In [21]:
# Matched labels vs. frequent labels vs. ontology items considered.
len(matched), len(tissues), len(ontology.items())

(63, 312, 4659)

In [22]:
# Distinct values of the `series` column (flattened by utils.flatten) over the
# samples whose tissue is one of the frequent labels. `isin` performs the
# membership test in one vectorised pass rather than scanning the `tissues`
# list from a Python lambda for every row.
series = set(utils.flatten(data[data.tissue.isin(tissues)].series.tolist()))

In [23]:
# Number of distinct values collected into `series`.
len(series)

3167

In [25]:
# Expand the `series` column with utils.expand, then discard rows that have
# any missing value.
# NOTE(review): presumably yields one row per (sample, series) pair — confirm
# in lib.utils.expand.
data_expanded = (
    utils.expand(data, 'series')
    .dropna()
)

In [26]:
# Write the expanded table to disk as a pickle. Pickle files should only be
# read back from trusted locations.
data_expanded.to_pickle('../data/geo-annotation/geo-characteristics-tissue-expanded.py3.pickle')

In [27]:
# Preview the first ten expanded rows.
data_expanded.head(10)

Unnamed: 0,accession,characteristics,tissue,series
0,GSM15684,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
1,GSM15685,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
2,GSM15686,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
3,GSM15688,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
4,GSM15687,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
5,GSM15689,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
6,GSM15691,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
7,GSM15690,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
8,GSM15692,{'tissue': 'bronchial epithelium'},bronchial epithelium,GSE994
9,GSM15693,"{'age': '32', 'status': 'snc', 'history': 'cur...",bronchial epithelium,GSE994


In [33]:
# One row per distinct (accession, tissue) pair.
samples_tissues = (
    data_expanded
    .loc[:, ['accession', 'tissue']]
    .drop_duplicates()
)

In [34]:
# Attach the ontology id for each tissue label; labels that are not keys of
# `matched` get NaN.
samples_tissues['oid'] = samples_tissues['tissue'].map(matched)
samples_tissues.head(10)

Unnamed: 0,accession,tissue,oid
0,GSM15684,bronchial epithelium,BTO:0001845
1,GSM15685,bronchial epithelium,BTO:0001845
2,GSM15686,bronchial epithelium,BTO:0001845
3,GSM15688,bronchial epithelium,BTO:0001845
4,GSM15687,bronchial epithelium,BTO:0001845
5,GSM15689,bronchial epithelium,BTO:0001845
6,GSM15691,bronchial epithelium,BTO:0001845
7,GSM15690,bronchial epithelium,BTO:0001845
8,GSM15692,bronchial epithelium,BTO:0001845
9,GSM15693,bronchial epithelium,BTO:0001845


In [36]:
# Write the per-sample tissue/ontology-id table to disk as a pickle. Rows with
# a NaN `oid` are still present at this point.
samples_tissues.to_pickle('../data/geo-annotation/geo-characteristics-tissue-samples.more-tissues.py3.pickle')

In [None]:
# One row per distinct (series, tissue) pair.
series_tissue = (
    data_expanded
    .loc[:, ['series', 'tissue']]
    .drop_duplicates()
)

In [None]:
# Attach the ontology id for each tissue label; labels that are not keys of
# `matched` get NaN.
series_tissue['oid'] = series_tissue['tissue'].map(matched)

In [None]:
# Drop rows with any missing value, which removes the rows whose tissue had no
# ontology match (their `oid` is NaN).
series_tissue = series_tissue.dropna()

In [None]:
# Preview the first ten (series, tissue, oid) rows.
series_tissue.head(10)

In [None]:
# Write the per-series tissue/ontology-id table to disk as a pickle.
series_tissue.to_pickle('../data/geo-annotation/geo-characteristics-tissue-series.more-tissues.py3.pickle')