In [1]:
import pymongo
from elasticsearch import Elasticsearch


In [2]:
# Handle to the 'scraper_array-express' database on the default MongoClient()
# connection; every `db.series` query in this notebook goes through it.
db = pymongo.MongoClient()['scraper_array-express']

In [3]:
# Total number of documents in the `series` collection.
# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed
# in 4.0; switch to count_documents({}) when the driver is upgraded.
db.series.count()

60859

In [7]:
# Unwind each series' `platforms` array and group on the unwound value, which
# yields one `{'_id': <platform>}` document per distinct platform.
distinct_platforms_pipeline = [
    {'$unwind': '$platforms'},
    {'$group': {'_id': '$platforms'}},
]
res = db.series.aggregate(distinct_platforms_pipeline)

In [9]:
# Materialize the aggregation result into a list so it can be measured and
# sliced in the following cells.
res = list(res)

In [10]:
# Number of distinct platform identifiers returned by the aggregation.
len(res)

11418

In [11]:
# Peek at the first ten aggregation results.
res[:10]

[{'_id': 'A-MTAB-542'},
 {'_id': 'GPL17'},
 {'_id': 'GPL62'},
 {'_id': 'GPL171'},
 {'_id': 'GPL174'},
 {'_id': 'A-SNGR-2'},
 {'_id': 'GPL12'},
 {'_id': 'GPL228'},
 {'_id': 'GPL263'},
 {'_id': 'A-MEXP-1'}]

In [13]:
# Keep only the platform accessions that start with 'A-'; other identifiers
# present in `res` (for example the 'GPL…' ones) are dropped.
platforms = [
    accession
    for accession in (entry['_id'] for entry in res)
    if accession.startswith('A-')
]

In [14]:
# Peek at the first ten retained platform accessions.
platforms[:10]

['A-MTAB-542',
 'A-SNGR-2',
 'A-MEXP-1',
 'A-MEXP-8',
 'A-SMDB-552',
 'A-SMDB-447',
 'A-SMDB-534',
 'A-MANP-2',
 'A-MEXP-42',
 'A-MEXP-18']

In [15]:
# How many platforms passed the 'A-' prefix filter.
len(platforms)

2670

In [64]:
def get_geo_acc(platform, limit=10):
    """Sample GEO-derived series that were run on exactly one platform.

    Queries the global ``db.series`` collection for documents whose
    ``accession`` starts with ``E-GEO`` and whose ``platforms`` array contains
    ``platform`` and has exactly one element.

    Parameters
    ----------
    platform : str
        Platform accession to look for (e.g. ``'A-AFFY-44'``).
    limit : int, optional
        Maximum number of documents to fetch. Defaults to 10.

    Returns
    -------
    list of dict or None
        Documents of the form ``{'accession': ...}`` (``_id`` is projected
        out), or ``None`` when nothing matches.
    """
    query = {
        # Anchored at the start of the string: an unanchored pattern would
        # also match 'E-GEO' in the middle of an accession and cannot be
        # served as an index prefix scan. The trailing '.*' was redundant.
        'accession': {'$regex': '^E-GEO'},
        # `$and` is required because both conditions constrain the same field.
        '$and': [
            {'platforms': platform},
            {'platforms': {'$size': 1}},
        ],
    }
    projection = {'_id': 0, 'accession': 1}
    res = list(db.series.find(query, projection).limit(limit))
    return res or None


In [65]:
# Pair every platform with whatever get_geo_acc returns for it (a list of
# accession documents, or None). This issues one MongoDB query per platform.
platforms_series = list(zip(platforms, map(get_geo_acc, platforms)))

In [66]:
# Persist the query results with IPython's %store so they can be restored
# later without re-running the per-platform queries.
%store platforms_series

Stored 'platforms_series' (list)


In [41]:
import pandas as pd
import numpy as np

In [67]:
# Tabulate the (platform, series) pairs, indexed by platform accession.
platforms_series_df = pd.DataFrame.from_records(
    platforms_series,
    columns=['platform', 'series'],
    index='platform',
)

In [68]:
# Spot-check one platform among the rows that actually have series attached.
platforms_series_df[platforms_series_df.series.notnull()].loc['A-AFFY-44']

series    [{'accession': 'E-GEOD-71949'}, {'accession': ...
Name: A-AFFY-44, dtype: object

In [69]:
def to_geo_series(ae_series):
    """Turn accession documents into ``GSE<id>`` strings.

    Parameters
    ----------
    ae_series : list of dict or None
        Documents carrying an ``'accession'`` key whose value splits on ``'-'``
        into exactly three parts (e.g. ``'E-GEOD-71949'``).

    Returns
    -------
    list of str or float
        One ``'GSE<id>'`` string per input document, built from the third part
        of the accession; ``numpy.nan`` when ``ae_series`` is ``None``.

    Raises
    ------
    ValueError
        If an accession does not split into exactly three '-'-separated parts.
    """
    def _as_gse(entry):
        # Strict three-way unpacking: a malformed accession fails loudly.
        _kind, _origin, number = entry['accession'].split('-')
        return 'GSE{}'.format(number)

    if ae_series is not None:
        return [_as_gse(entry) for entry in ae_series]
    return np.nan

In [70]:
# Add the GSE ids for every row; rows whose `series` is None map to NaN
# (see to_geo_series).
platforms_series_df['geo_series'] = platforms_series_df.series.map(to_geo_series)

In [73]:
# Second database on the default MongoClient() connection; get_geo_platform
# reads series records from its `series` collection.
geo_db = pymongo.MongoClient()['scraper_test_dev']
def get_geo_platform(geo_series):
    """Return the platforms of the first listed series found in ``geo_db``.

    The accessions are tried in order against the global ``geo_db.series``
    collection; the ``platforms`` value of the first matching document is
    returned and no further lookups are made.

    Parameters
    ----------
    geo_series : iterable of str
        Series accessions to look up (e.g. ``['GSE71949', ...]``).

    Returns
    -------
    object or float
        The ``platforms`` field of the first document found, or ``numpy.nan``
        when none of the accessions is present in the collection.

    Raises
    ------
    KeyError
        If the matching document has no ``platforms`` field.
    """
    projection = {'_id': 0, 'platforms': 1}
    for accession in geo_series:
        # find_one stops at the first match instead of materializing the
        # whole result set just to read its first element.
        doc = geo_db.series.find_one({'accession': accession}, projection)
        if doc:
            return doc['platforms']

    return np.nan
    

In [72]:
# Keep only the rows that have GEO series ids. The .copy() makes this an
# independent frame, so the `geo_platform` column assigned to it afterwards
# does not write into a slice of platforms_series_df.
platforms_series_df_good = platforms_series_df[platforms_series_df.geo_series.notnull()].copy()

In [74]:
# Look up the platforms for each row's series in geo_db; rows where none of
# the series is found get NaN (see get_geo_platform).
platforms_series_df_good['geo_platform'] = platforms_series_df_good.geo_series.map(get_geo_platform)

In [79]:
# Map each platform in the index to its counterpart platform, keeping only the
# unambiguous rows whose looked-up `geo_platform` holds exactly one entry.
# Rows where the lookup found nothing hold NaN (see get_geo_platform); they
# are dropped first because len() raises TypeError on a float.
_resolved_geo_platforms = platforms_series_df_good.geo_platform.dropna()
affy_platform_to_geo = (
    _resolved_geo_platforms
    [_resolved_geo_platforms.map(len) == 1]
    .map(lambda x: x[0])
    .to_dict()
)

In [80]:
# Persist the platform mapping with IPython's %store for reuse in later sessions.
%store affy_platform_to_geo

Stored 'affy_platform_to_geo' (dict)


In [82]:
# Reload the previously stored mapping (lets a fresh kernel skip the queries above).
%store -r affy_platform_to_geo

In [63]:
# Build `--task <GSE>` command-line arguments for every series whose platform
# lookup came back empty.
_missing_geo_series = platforms_series_df_good.loc[
    platforms_series_df_good.geo_platform.isnull(), 'geo_series'
]
# Each `geo_series` cell is a list of GSE ids (see to_geo_series), so flatten
# before joining; str.join on a list of lists raises TypeError. A bare string
# cell is still accepted as a single id.
_t = [
    gse
    for entry in _missing_geo_series
    for gse in ([entry] if isinstance(entry, str) else entry)
]
# Prefix per item so an empty list yields '' rather than a dangling '--task '.
' '.join('--task {}'.format(gse) for gse in _t)

'--task GSE64013 --task GSE62882 --task GSE17585 --task GSE59485 --task GSE52729 --task GSE51147 --task GSE63873 --task GSE70693 --task GSE66253 --task GSE68950 --task GSE62123 --task GSE70396 --task GSE60452 --task GSE69659 --task GSE68744 --task GSE68882 --task GSE64110 --task GSE69886 --task GSE64109 --task GSE67057 --task GSE62250 --task GSE64228 --task GSE42505 --task GSE63274 --task GSE68065 --task GSE61728 --task GSE21598 --task GSE43686 --task GSE67225 --task GSE57600 --task GSE63084 --task GSE60388 --task GSE66496 --task GSE70443 --task GSE57615 --task GSE69134'