In [2]:
import pandas as pd
import pymongo

# One MongoClient owns one connection pool; both databases are reached through
# it instead of opening a second, redundant client.
mongo_client = pymongo.MongoClient()
db_meta = mongo_client.scraper_meta  # read side: queried with find() further down
db_pa = mongo_client.pharm_atlas     # write side: collections here are rebuilt further down

In [3]:
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
m = pd.read_pickle('./data/set_with_meta.pickle')

# One row per disease holding the value GroupBy.first() picks from its `doid` column.
disease_doid = m.groupby('disease')['doid'].first().to_frame()
disease_doid.head(10)


Unnamed: 0_level_0,doid
disease,Unnamed: 1_level_1
Alzheimer's Disease,DOID:10652
Barrett's esophagus,DOID:9206
Crohn's disease,DOID:8778
Crohn’s ileitis,DOID:0060189
Friedreich ataxia,DOID:12705
Her2-receptor negative breast cancer,DOID:0060080
Her2-receptor positive breast cancer,DOID:0060079
Huntington's disease,DOID:12858
Job's syndrome,DOID:3261
Kawasaki disease,DOID:13378


In [5]:
# Load the ';'-separated training-set export and list its columns.
tr_set = pd.read_csv(
    './data/out_training_set_exp.csv',
    delimiter=';',
)
tr_set.columns

Index(['Unnamed: 0', 'accession', 'summary', 'title', 'overall_design',
       'disease', 'tissue', 'sample_accession', 'sample_title',
       'sample_description', 'norm_or_tumour', 'sample_source_name'],
      dtype='object')

In [6]:
# Index the training set by sample accession. The guard keeps the cell
# re-runnable: once the column has become the index, calling set_index again
# would raise KeyError.
if tr_set.index.name != 'sample_accession':
    tr_set = tr_set.set_index('sample_accession')

In [69]:
PAGE_SIZE = 1000  # number of accessions sent to MongoDB per `$in` query

# sample accession -> set of series accessions listed for that sample in tr_set
series_by_samples = tr_set.reset_index().groupby('sample_accession')['accession'].apply(set)

# Rebuild the target collection from scratch. delete_many({}) replaces the
# deprecated Collection.remove({}) and belongs to the same API generation as
# the insert_many call used below.
db_pa.samples.delete_many({})
for i in range(0, tr_set.shape[0], PAGE_SIZE):
    accessions = list(tr_set.index[i: i + PAGE_SIZE])
    samples_c = db_meta.samples.find({'accession': {'$in': accessions}})
    samples_b = []
    for sample in samples_c:
        # Single row lookup instead of one per annotated field.
        row = tr_set.loc[sample['accession']]
        sample['disease'] = row.disease
        sample['tissue'] = row.tissue
        sample['series'] = list(series_by_samples.loc[sample['accession']])
        samples_b.append(sample)
    # insert_many rejects an empty document list, so skip pages for which the
    # metadata database returned nothing.
    if samples_b:
        db_pa.samples.insert_many(samples_b)

tr_set.shape

(16511, 11)

In [8]:
# Number of documents currently stored in the `samples` collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases;
# confirm the installed version before switching to count_documents({}).
db_pa['samples'].count()

16511

In [71]:
PAGE_SIZE = 1000  # number of accessions sent to MongoDB per `$in` query

# series accession -> set of sample accessions listed for that series in tr_set
samples_by_series = tr_set.reset_index().groupby('accession')['sample_accession'].apply(set)

# Rebuild the target collection from scratch. delete_many({}) replaces the
# deprecated Collection.remove({}).
db_pa.series.delete_many({})
series = list(tr_set['accession'].unique())
for i in range(0, len(series), PAGE_SIZE):
    accessions = series[i: i + PAGE_SIZE]
    bucket = []
    # Iterate the cursor directly; no intermediate list is needed.
    for s in db_meta.series.find({'accession': {'$in': accessions}}):
        s['samples'] = list(samples_by_series.loc[s['accession']])
        bucket.append(s)
    # insert_many rejects an empty document list, so skip pages for which the
    # metadata database returned nothing.
    if bucket:
        db_pa.series.insert_many(bucket)

db_pa.series.count()

304

In [72]:
# Pull every document of both rebuilt collections into plain Python lists.
samples_c = list(db_pa.samples.find({}))
series_c = list(db_pa.series.find({}))


In [11]:
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.helpers import bulk

# Default-configured Elasticsearch client plus its index-administration API.
es = Elasticsearch()
es_i = es.indices




In [12]:
!curl -XDELETE 'http://localhost:9200/series/'
!curl -XDELETE 'http://localhost:9200/samples/'


{"acknowledged":true}{"acknowledged":true}

In [73]:
import json
from bson import json_util

!mkdir -p data/pharm-atlas/


json.dump(samples_c, open('data/pharm-atlas/samples.json', 'w'), default=json_util.default)
json.dump(series_c, open('data/pharm-atlas/series.json', 'w'), default=json_util.default)

In [25]:
def _string_field(analyzed):
    """Build the mapping entry for a string field.

    Analyzed fields get the bare ``{'type': 'string'}`` entry; the others are
    additionally flagged ``"index": "not_analyzed"``.
    """
    spec = {'type': 'string'}
    if not analyzed:
        spec['index'] = 'not_analyzed'
    return spec


# Mapping for the `series` document type: (field name, analyzed?) pairs.
series_mapping = {
    'series': {
        'properties': {
            field: _string_field(analyzed)
            for field, analyzed in [
                ('organism', False),
                ('title', True),
                ('accession', False),
                ('summary', True),
                ('overall_design', True),
                ('samples', False),
                ('platforms', False),
                ('type', False),
            ]
        }
    }
}

# Mapping for the `samples` document type: (field name, analyzed?) pairs.
samples_mapping = {
    'samples': {
        'properties': {
            field: _string_field(analyzed)
            for field, analyzed in [
                ('organism', False),
                ('source_name', True),
                ('sample_type', True),
                ('description', True),
                ('disease', False),
                ('platform', False),
                ('series', False),
                ('accession', False),
            ]
        }
    }
}
def import_index(client, file_name, index, doc_type, mapping, delete=False):
    """(Re)create an Elasticsearch index and bulk-load a JSON file into it.

    Parameters
    ----------
    client : Elasticsearch client used for index administration and the bulk load.
    file_name : path of a JSON file holding a list of documents.
    index : name of the index to create and fill.
    doc_type : document type the documents are stored under.
    mapping : mapping body passed to put_mapping; skipped when falsy.
    delete : when True, drop the index first (400/404 responses are ignored).

    Returns
    -------
    The value returned by ``elasticsearch.helpers.bulk``.
    """
    def to_action(_source):
        # '_id' is stripped from the document body before indexing
        # (presumably the MongoDB ObjectId carried over by the export; confirm).
        _source.pop('_id', None)
        return {
            '_index': index,
            '_type': doc_type,
            '_source': _source
        }

    if delete:
        client.indices.delete(index=index, ignore=[400, 404])

    # ignore=400 keeps this call from raising on a bad-request response,
    # e.g. when the index already exists.
    client.indices.create(index=index, ignore=400)

    if mapping:
        client.indices.put_mapping(index=index, doc_type=doc_type, body=mapping)

    # Close the file deterministically instead of leaking the handle.
    with open(file_name) as source_file:
        collection = json.load(source_file)

    return bulk(client, (to_action(document) for document in collection))


In [28]:
# Recreate and fill both indices from the exported JSON files.
import_index(
    es,
    file_name='data/pharm-atlas/series.json',
    index='series',
    doc_type='series',
    mapping=series_mapping,
    delete=True,
)
import_index(
    es,
    file_name='data/pharm-atlas/samples.json',
    index='samples',
    doc_type='samples',
    mapping=samples_mapping,
    delete=True,
)

(16511, [])

In [29]:
# Show the mapping Elasticsearch now holds for the `series` type.
es_i.get_mapping(
    index='series',
    doc_type='series',
)

{'series': {'mappings': {'series': {'properties': {'accession': {'index': 'not_analyzed',
      'type': 'string'},
     'contact': {'properties': {'City': {'type': 'string'},
       'Country': {'type': 'string'},
       'Fax': {'type': 'string'},
       'Lab': {'type': 'string'},
       'Name': {'type': 'string'},
       'Organization name': {'type': 'string'},
       'Phone': {'type': 'string'},
       'State/province': {'type': 'string'},
       'Street address': {'type': 'string'},
       'Zip/postal_code': {'type': 'string'}}},
     'contributor': {'type': 'string'},
     'data_source': {'type': 'string'},
     'last_update_date': {'properties': {'$date': {'type': 'long'}}},
     'meta': {'properties': {'geo_id': {'type': 'long'}}},
     'organism': {'index': 'not_analyzed', 'type': 'string'},
     'overall_design': {'type': 'string'},
     'platforms': {'index': 'not_analyzed', 'type': 'string'},
     'relations': {'properties': {'Affiliated with': {'type': 'string'},
       'BioP

In [66]:
# Exact-term lookup on `organism`, returning only the `accession` field of each hit.
human_series_body = {
    'query': {'term': {'organism': 'Homo sapiens'}},
    'fields': ['accession'],
}
es.search(index='series', body=human_series_body)

{'_shards': {'failed': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'AU6ttnvk7j9BsmuCK2of',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE3744']}},
   {'_id': 'AU6ttnvk7j9BsmuCK2ok',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE4757']}},
   {'_id': 'AU6ttnvk7j9BsmuCK2op',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE6004']}},
   {'_id': 'AU6ttnvk7j9BsmuCK2ou',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE7451']}},
   {'_id': 'AU6ttnvl7j9BsmuCK2ow',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE7553']}},
   {'_id': 'AU6ttnvl7j9BsmuCK2o1',
    '_index': 'series',
    '_score': 1.0165293,
    '_type': 'series',
    'fields': {'accession': ['GSE8167']}},
   {'_id': 'AU6ttnvl7j9BsmuC

In [70]:
# Samples whose `series` field contains any of the listed series accessions;
# only the `accession` field of each hit is returned.
samples_of_series_body = {
    'query': {'terms': {'series': ['GSE3744', 'GSE10810']}},
    'fields': ['accession'],
}
es.search(index='samples', body=samples_of_series_body)

{'_shards': {'failed': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'AU6ttpWO7j9BsmuCK6bV',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85477']}},
   {'_id': 'AU6ttpWO7j9BsmuCK6be',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85486']}},
   {'_id': 'AU6ttpWO7j9BsmuCK6bj',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85491']}},
   {'_id': 'AU6ttpWO7j9BsmuCK6bo',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85496']}},
   {'_id': 'AU6ttpWO7j9BsmuCK6bt',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85501']}},
   {'_id': 'AU6ttpWO7j9BsmuCK6by',
    '_index': 'samples',
    '_score': 2.5545564,
    '_type': 'samples',
    'fields': {'accession': ['GSM85506']}},
   {'_id':

In [75]:
# Count series per `platforms` value; size 0 suppresses the hits themselves so
# only the aggregation buckets come back.
platform_counts_body = {
    'size': 0,
    'aggs': {
        'series': {
            'terms': {'field': 'platforms'},
        },
    },
}
es.search(index='series', body=platform_counts_body)

{'_shards': {'failed': 0, 'successful': 5, 'total': 5},
 'aggregations': {'series': {'buckets': [{'doc_count': 304, 'key': 'GPL570'},
    {'doc_count': 3, 'key': 'GPL3720'},
    {'doc_count': 3, 'key': 'GPL96'},
    {'doc_count': 2, 'key': 'GPL1261'},
    {'doc_count': 2, 'key': 'GPL3718'},
    {'doc_count': 1, 'key': 'GPL13118'},
    {'doc_count': 1, 'key': 'GPL1355'},
    {'doc_count': 1, 'key': 'GPL4092'},
    {'doc_count': 1, 'key': 'GPL571'},
    {'doc_count': 1, 'key': 'GPL8227'}],
   'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 4}},
 'hits': {'hits': [], 'max_score': 0.0, 'total': 304},
 'timed_out': False,
 'took': 2}

In [132]:
# Query clause type to use for each searchable field of a `series` document.
series_field_defs = {
    'organism': {'type': 'term'},
    'title': {'type': 'match'},
    'accession': {'type': 'term'},
    'summary': {'type': 'match'},
    'overall_design': {'type': 'match'},
    'samples': {'type': 'term'},
    'platforms': {'type': 'term'},
    'type': {'type': 'term'}
}

# Query clause type to use for each searchable field of a `samples` document.
samples_field_defs = {
    'organism': {'type': 'term'},
    'source_name': {'type': 'match'},
    'sample_type': {'type': 'match'},
    'description': {'type': 'match'},
    'disease': {'type': 'term'},
    'platform': {'type': 'term'},
    'series': {'type': 'term'},
    'accession': {'type': 'term'},
}


def query(predicates):
    """Translate ``{'<collection>.<field>': value}`` predicates into query clauses.

    Parameters
    ----------
    predicates : dict whose keys are ``'series.<field>'`` or ``'samples.<field>'``
        and whose values are the values to search for.

    Returns
    -------
    (series_query, samples_query) : two lists of ``{'term': ...}`` or
        ``{'match': ...}`` clauses, one list per collection.

    Raises
    ------
    ValueError : the key is not of the form ``collection.field`` or names a
        collection other than 'series' or 'samples'.
    KeyError : the field is not declared for that collection.
    """
    field_defs_by_collection = {
        'series': series_field_defs,
        'samples': samples_field_defs,
    }
    clauses = {'series': [], 'samples': []}

    for key, value in predicates.items():
        collection, field = key.split('.')

        # Reject unknown collections explicitly. Previously they either raised
        # UnboundLocalError or silently reused the target list and clause type
        # left over from the preceding predicate.
        if collection not in field_defs_by_collection:
            raise ValueError(
                "unknown collection %r in predicate %r" % (collection, key))

        clause_type = field_defs_by_collection[collection][field]['type']
        if clause_type in ('term', 'match'):
            clauses[collection].append({clause_type: {field: value}})

    return clauses['series'], clauses['samples']


def must_query(index, doc_type, args):
    """Search `index`/`doc_type` for documents satisfying every clause in `args`.

    `args` is used as the ``must`` list of a bool query; the raw response of
    ``es.search`` is returned.
    """
    body = {'query': {'bool': {'must': args}}}
    return es.search(index=index, doc_type=doc_type, body=body)

def accession_query(index, doc_type, args, size=10000):
    """Return the `accession` of each document satisfying every clause in `args`.

    Parameters
    ----------
    index, doc_type : where to search.
    args : list of query clauses, used as the ``must`` list of a bool query.
    size : maximum number of hits to fetch. Without an explicit size
        Elasticsearch returns only its default page of 10 hits, which silently
        truncated the accession list.

    Returns
    -------
    list of accession strings, in the order the hits were returned. Hits that
    carry no `accession` field are skipped.
    """
    response = es.search(index=index, doc_type=doc_type, body={
        'query': {'bool': {'must': args}},
        'fields': ['accession'],
        'size': size,
    })

    accessions = []
    for hit in response['hits']['hits']:
        # A hit has no 'fields' entry when none of the requested fields exist on it.
        values = (hit.get('fields') or {}).get('accession')
        if values:
            accessions.append(values[0])
    return accessions


def search(predicates):
    """Find samples matching `predicates`, given as 'series.*' / 'samples.*' keys.

    Series predicates are resolved first into a list of series accessions,
    which then restricts the sample search through a ``terms`` clause on the
    `series` field. Returns None when series predicates were given but matched
    nothing; otherwise returns the result of ``must_query`` on the samples index.
    """
    series_query, samples_query = query(predicates)

    if len(series_query) > 0:
        matching_series = accession_query('series', 'series', series_query)
        if len(matching_series) == 0:
            return None
        samples_query.append({'terms': {'series': matching_series}})

    return must_query(index='samples', doc_type='samples', args=samples_query)
        
    
# 'GPL96' is a platform id (it appears as a `platforms` bucket in the
# aggregation output), while series accessions look like 'GSE...'. Filtering
# on series.accession therefore matched nothing and search() returned None.
search({
        'series.platforms': "GPL96"
    })

In [37]:
import pprint

# Pretty-print the stored document of the first hit in the `series` index.
first_series_hit = es.search(index='series', size=1)['hits']['hits'][0]
pprint.pprint(first_series_hit['_source'])

{'accession': 'GSE66354',
 'contact': {'City': 'Aurora',
             'Country': 'USA',
             'Lab': 'RC1 North, P18-4401M',
             'Name': 'Andrew M Donson',
             'Organization name': 'UC Denver',
             'Phone': '303 724 4012',
             'State/province': 'CO',
             'Street address': '12800 E 19th Ave',
             'Zip/postal_code': '80010'},
 'contributor': 'Donson AM',
 'data_source': 'geo',
 'last_update_date': {'$date': 1426809600000},
 'meta': {'geo_id': 66354},
 'organism': 'Homo sapiens',
 'overall_design': 'Gene expression profiles were generated from surgical '
                   'tumor and normal brain samples (n=149) using Affymetrix '
                   'HG-U133plus2 chips (Platform GPL570).',
 'platforms': ['GPL570'],
 'pubmed_id': None,
 'relations': {'BioProject': ['http://www.ncbi.nlm.nih.gov/bioproject/PRJNA276629']},
 'scrap_date': {'$date': 1434625339319},
 'status': 'Public on Feb 27 2015',
 'submission_date': {'$date': 1424

In [43]:
# Run the project's metadata.py script from inside the notebook.
# NOTE(review): the path points into one user's home directory, so this cell
# only works on a machine with that checkout; consider a configurable path.
%run ~/Sources/PharmAtlas/PharmAtlas/metadata/metadata.py

In [39]:
# get_search_fields is not defined in this notebook; presumably it comes from
# the %run of metadata.py (confirm). The output lists (collection, field) pairs.
get_search_fields()

[('series', 'type'),
 ('series', 'accession'),
 ('series', 'overall_design'),
 ('series', 'title'),
 ('series', 'organism'),
 ('series', 'samples'),
 ('series', 'summary'),
 ('samples', 'series'),
 ('samples', 'accession'),
 ('samples', 'platform'),
 ('samples', 'organism'),
 ('samples', 'description'),
 ('samples', 'sample_type'),
 ('samples', 'disease'),
 ('samples', 'source_name')]

In [74]:
# Run the project's rawdata.py script; presumably it defines get_samples (confirm).
%run ~/Sources/PharmAtlas/PharmAtlas/metadata/rawdata.py
# Fetch three samples by accession. The output is a table indexed by probe
# with one column per requested sample.
# NOTE(review): the second argument is an absolute, user-specific path; this
# cell is not portable to another machine as written.
get_samples(['GSM496090', 'GSM496091', 'GSM496092'], '/Users/nikita/Sources/PharmAtlas/Data/raw')

Unnamed: 0_level_0,GSM496090,GSM496091,GSM496092
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1007_s_at,303.244516,282.016937,271.893947
1053_at,66.793174,73.039329,90.101936
117_at,57.692653,63.475379,53.678112
121_at,409.035096,408.410138,335.907658
1255_g_at,11.738109,11.779567,12.484152
1294_at,79.103901,84.436455,88.219110
1316_at,43.823432,43.176210,45.818475
1320_at,19.775184,19.969685,20.068924
1405_i_at,10.796360,12.028591,10.182012
1431_at,12.744064,13.492310,13.901024
