In [2]:
# Create the local target directory for the preprocessing database; -p makes this a no-op if it already exists.
!mkdir -p ../data/preproc/intermediate/

In [8]:
# Copy preproc.sqlite3 from the remote host into the local intermediate directory (-v verbose, -z compress in transit).
# NOTE(review): the user name and host are hardcoded, so re-running this cell needs that account's credentials.
!rsync -vz npryanichnikov@ui2.computing.kiae.ru:/s/ls2/groups/g0064/preproc/preproc.sqlite3 \
    ../data/preproc/intermediate/

preproc.sqlite3

sent 30,978 bytes  received 7,354 bytes  10,952.00 bytes/sec
total size is 26,576,896  speedup is 693.33


In [1]:
import pandas as pd
import numpy as np
import sqlite3


In [2]:
from sqlalchemy import create_engine

# SQLAlchemy engine over the local SQLite copy of the preprocessing task database.
con = create_engine('sqlite:///../data/preproc/intermediate/preproc.sqlite3')

# Load the whole `task` table into a DataFrame; the rest of the notebook works off `preproc`.
preproc = pd.read_sql_table(table_name='task', con=con)

In [4]:
# Platform accessions that manufacturer() labels 'affy-frma'.
# Trailing notes record what was observed while processing each platform.
FRMA_PLATFORMS = list(map('GPL{}'.format, [
    570,
    571,
    96,
    1261,  # organism Mus musculus
    6244,  # getting strange errors. data in matrix
    6246,  # no frma package (?). read.affybatch: The affy package can process data from the
           # Gene ST 1.x series of arrays, but you should consider using either the oligo or
           # xps packages, which are specifically designed for these arrays.
           # there is no package called 'mogene10stv1frmavecs'
    90,    # organism Saccharomyces cerevisiae
    3921,
    8321,  # organism Mus musculus
    # Deliberately left out:
    #   1355 - NO FRMA, Rattus norvegicus
    #   5175 - no frma package (?). read.affybatch: The affy package is not designed for
    #          this array type. Please use either the oligo or xps package.
    #   5188 - no cdf frma packages
]))

# Platform accessions that manufacturer() labels 'agilent'.
# NOTE: the constant name is misspelled ("AGLIENT"); it is kept as-is because
# manufacturer() looks it up under this exact name.
AGLIENT_PLATFORMS = ['GPL%d' % number for number in (
    4133, 6480, 15849, 1708, 887, 4091, 9128, 7264, 11387,
    8687, 6848, 2879, 5477, 8841, 10123, 4093, 11386, 10806,
    8269, 10150, 8583, 15931, 4126, 10152, 16050, 2567, 9053,
    14550, 5325, 10808, 13691, 9075, 8736, 885, 9777, 7504,
    8693, 2873, 17077, 10734, 13953, 13607, 13685, 7015, 15560,
    18623, 10481, 16280, 8737, 11068,
)]

# Platform accessions that manufacturer() labels 'illumina'.
ILLUMINA_PLATFORMS = ['GPL' + str(number) for number in (10558, 6947)]

In [5]:
# Task names of type 'preproc' look like '<series>_<platform>'; take the part after the first '_'.
# The assignment aligns on the index, so rows of any other task type get NaN in `platform`.
preproc['platform'] = (
    preproc
    .loc[preproc['type'] == 'preproc', 'name']
    .str.split('_')
    .str.get(1)
)

In [6]:
def manufacturer(platform):
    """Classify a platform accession by the pipeline that handles it.

    Returns 'agilent', 'affy-frma' or 'illumina' when `platform` appears in the
    corresponding module-level platform list, and 'unknown' when it is missing
    (null/NaN) or appears in none of them.
    """
    # Missing platform: nothing to look up.
    if pd.isnull(platform):
        return 'unknown'

    # Checked in this order; the first list containing the platform wins.
    lookup = (
        ('agilent', AGLIENT_PLATFORMS),
        ('affy-frma', FRMA_PLATFORMS),
        ('illumina', ILLUMINA_PLATFORMS),
    )
    for label, known_platforms in lookup:
        if platform in known_platforms:
            return label
    return 'unknown'

# Label every task row; rows without a platform (NaN) come out as 'unknown'.
preproc['manufacturer'] = preproc['platform'].map(manufacturer)

In [43]:
# Show the raw `meta` JSON of the first task whose name starts with 'GSE40550'.
(
    preproc
    .loc[preproc['name'].map(lambda task_name: task_name.startswith('GSE40550'))]
    .iloc[0]
    ['meta']
)

'{"samples": ["GSM996242", "GSM996243", "GSM996244", "GSM996245"], "platform": "GPL6848", "accession": "GSE40550"}'

In [7]:
import sys
sys.path.append('..')

In [8]:
from lib.annotation_api import Api
import lib.annotation_api

In [9]:
from importlib import reload
# Re-execute lib.annotation_api so edits to the module on disk are picked up without restarting the kernel.
# NOTE: reload() does not rebind names imported with `from lib.annotation_api import Api`;
# that `Api` name keeps pointing at the pre-reload class until it is imported again.
reload(lib.annotation_api)

<module 'lib.annotation_api' from '../lib/annotation_api/__init__.py'>

In [10]:
# Annotation API client used by the export cells below.
# NOTE(review): 'scraper_test_dev' is presumably the target database name — confirm against lib.annotation_api.
api = Api('scraper_test_dev')

In [11]:
import json
from lib.utils import flatten

# Collect the sample accessions listed in the `meta` JSON of every finished
# 'convert-merge' task; tasks whose meta has no 'samples' key contribute nothing.
converted_samples_frma = list(flatten([
    json.loads(task_meta).get('samples', [])
    for task_meta in preproc.query("type == 'convert-merge' and status == 'done'")['meta']
]))

In [12]:
def export_preprocessed(api, samples, annotated=0):
    """Mark the given samples as preprocessed through the annotation API.

    Builds one `(accession, {'preprocessed': 1, 'annotated': annotated})` pair
    per entry of `samples` and hands the whole list to `api.write_annotation`
    in a single call.

    Parameters:
        api: object exposing `write_annotation(list_of_pairs)`.
        samples: iterable of sample accessions.
        annotated: value stored under the 'annotated' key (default 0).

    Returns:
        None.
    """
    payload = []
    for sample_accession in samples:
        # A fresh dict per sample, so entries never share state.
        flags = {'preprocessed': 1, 'annotated': annotated}
        payload.append((sample_accession, flags))
    api.write_annotation(payload)

In [13]:
# Number of sample accessions gathered from finished 'convert-merge' tasks.
len(converted_samples_frma)

190322

In [14]:
# Side effect: sends preprocessed=1 / annotated=1 for every collected sample through api.write_annotation.
# Re-running this cell repeats the write.
export_preprocessed(api, converted_samples_frma, annotated=1)

In [15]:
# Parsed `meta` JSON (one dict per task) of every finished Illumina preproc task.
# Despite the name, these are per-task metadata dicts at this point, not sample accessions.
converted_samples_illumina = [
    json.loads(task_meta)
    for task_meta in preproc.query(
        "type == 'preproc' and status == 'done' and manufacturer == 'illumina'"
    )['meta']
]

In [71]:
# Counts finished Illumina preproc tasks (one metadata dict each), not individual samples.
len(converted_samples_illumina)

606

In [20]:
# Peek at Illumina preproc tasks of any status, restricted to the columns of interest.
(
    preproc
    .query("type == 'preproc' and manufacturer == 'illumina'")
    .loc[:, ['platform', 'meta', 'error', 'status', 'name']]
    .head()
)

Unnamed: 0,platform,meta,error,status,name
68,GPL10558,"{""samples"": [""GSM1587800"", ""GSM1587801"", ""GSM1...",,skip,GSE65136_GPL10558
235,GPL10558,"{""samples"": [""GSM1421924"", ""GSM1421925"", ""GSM1...",,skip,GSE58893_GPL10558
1306,GPL6947,"{""samples"": [""GSM655355"", ""GSM655354"", ""GSM655...",,skip,GSE26627_GPL6947
1494,GPL6947,"{""samples"": [""GSM796536"", ""GSM796537"", ""GSM796...",,skip,GSE32140_GPL6947
2015,GPL6947,"{""samples"": [""GSM589498"", ""GSM589499"", ""GSM589...",,skip,GSE23921_GPL6947


In [21]:
# First two parsed metadata dicts; at this point the list holds per-task metadata, not sample accessions.
converted_samples_illumina[:2]

[{'accession': 'GSE22427',
  'platform': 'GPL10558',
  'samples_count': 12,
  'suppl_file': 'GSE22427_non-normalized.txt.gz'},
 {'accession': 'GSE27124',
  'platform': 'GPL10558',
  'samples_count': 12,
  'suppl_file': 'GSE27124_non-normalized.txt.gz'}]

In [22]:
import pymongo

In [23]:
# Handle to the 'scraper_test_dev' database on a MongoClient created with default connection settings.
db = pymongo.MongoClient()['scraper_test_dev']

In [24]:
def get_samples(item):
    """Return the accessions of all `db.samples` documents matching a task.

    `item` must provide 'accession' (matched against the documents' `series`
    field) and 'platform' (matched against `platform`). Only the `accession`
    field of each matching document is fetched.
    """
    criteria = {
        'series': item['accession'],
        'platform': item['platform'],
    }
    # Projection: drop Mongo's _id and keep just the sample accession.
    only_accession = {'_id': 0, 'accession': 1}
    return [doc['accession'] for doc in db.samples.find(criteria, only_accession)]

In [25]:
# Spot-check get_samples on a single task's metadata; only 'accession' and 'platform' are read by it.
get_samples({'accession': 'GSE22427',
  'platform': 'GPL10558',
  'samples_count': 12,
  'suppl_file': 'GSE22427_non-normalized.txt.gz'})

['GSM557442',
 'GSM557441',
 'GSM557440',
 'GSM557439',
 'GSM557438',
 'GSM557437',
 'GSM557436',
 'GSM557435',
 'GSM557434',
 'GSM557433',
 'GSM557432',
 'GSM557431']

In [26]:
# Keep the per-task metadata under a more accurate name before `converted_samples_illumina`
# is rebound to actual sample accessions.
# NOTE: this is order-sensitive — re-running this cell after that rebinding would alias the
# sample list instead of the metadata, so run these cells top to bottom on a fresh kernel.
converted_series_illumina = converted_samples_illumina

In [27]:
# Replace the per-task metadata with the sample accessions stored in Mongo for each task
# (one db.samples query per task), flattened into a single list.
converted_samples_illumina = list(flatten([
    get_samples(series_meta)
    for series_meta in converted_series_illumina
]))

In [28]:
# After the expansion this counts sample accessions rather than tasks.
len(converted_samples_illumina)

20367

In [29]:
# Cross-check: total of the `samples_count` values declared in the task metadata,
# to compare against the number of accessions fetched from Mongo.
(
    preproc
    .query("type == 'preproc' and status == 'done' and manufacturer == 'illumina'")
    ['meta']
    .map(json.loads)
    .map(lambda task_meta: int(task_meta['samples_count']))
    .sum()
)

20367

In [34]:
# Same expression as the earlier preview, but the name now refers to plain sample accession strings.
converted_samples_illumina[:2]

['GSM557442', 'GSM557441']

In [30]:
# Side effect: sends preprocessed=1 / annotated=1 for every Illumina sample accession through api.write_annotation.
export_preprocessed(api, converted_samples_illumina, annotated=1)

In [89]:
# Preview the task table.
# pandas Series has no `.like()` method, so `preproc[preproc.name.like()]` raises
# AttributeError on a fresh run. The output recorded for this cell shows rows 0-9 of the
# table, so display those rows directly.
# NOTE(review): if a name filter was intended, use `preproc.name.str.contains(<pattern>)`.
preproc.head(10)

Unnamed: 0,id,name,status,meta,type,error,creation_date,submission_date,run_date,done_date,platform,manufacturer
0,1,GSE53224_GPL570,done,"{""samples"": [""GSM1287918"", ""GSM1287919"", ""GSM1...",preproc,,2015-08-21 18:46:27.696759,NaT,NaT,NaT,GPL570,affy-frma
1,2,GSE53183_GPL570,done,"{""samples"": [""GSM1286097"", ""GSM1286098"", ""GSM1...",preproc,,2015-08-21 18:46:27.698587,NaT,NaT,NaT,GPL570,affy-frma
2,3,GSE53157_GPL570,done,"{""samples"": [""GSM1283115"", ""GSM1283116"", ""GSM1...",preproc,,2015-08-21 18:46:27.699732,NaT,NaT,NaT,GPL570,affy-frma
3,4,GSE65721_GPL5082,skip,"{""samples"": [""GSM1603234"", ""GSM1603235"", ""GSM1...",preproc,,2015-08-21 18:46:27.700555,NaT,NaT,NaT,GPL5082,unknown
4,5,GSE65721_GPL570,done,"{""samples"": [""GSM1603354"", ""GSM1603355"", ""GSM1...",preproc,,2015-08-21 18:46:27.701362,NaT,NaT,NaT,GPL570,affy-frma
5,6,GSE53092_GPL570,done,"{""samples"": [""GSM1282316"", ""GSM1282317"", ""GSM1...",preproc,,2015-08-21 18:46:27.702123,NaT,NaT,NaT,GPL570,affy-frma
6,7,GSE65707_GPL570,done,"{""samples"": [""GSM1603354"", ""GSM1603355"", ""GSM1...",preproc,,2015-08-21 18:46:27.702899,NaT,NaT,NaT,GPL570,affy-frma
7,8,GSE53059_GPL570,done,"{""samples"": [""GSM1281432"", ""GSM1281433"", ""GSM1...",preproc,,2015-08-21 18:46:27.703622,NaT,NaT,NaT,GPL570,affy-frma
8,9,GSE53046_GPL570,done,"{""samples"": [""GSM1281025"", ""GSM1281026"", ""GSM1...",preproc,,2015-08-21 18:46:27.704425,NaT,NaT,NaT,GPL570,affy-frma
9,10,GSE53012_GPL570,done,"{""samples"": [""GSM1280329"", ""GSM1280330"", ""GSM1...",preproc,,2015-08-21 18:46:27.705136,NaT,NaT,NaT,GPL570,affy-frma


In [104]:
# Task status counts per platform for a hand-picked set of platform ids.
# Ids with no matching rows in `preproc` simply do not appear in the result.
(
    preproc
    [preproc.platform.isin(['GPL6848', 'GPL6480', 'A-AGIL-28', 'GPL17077', 'GPL20163',
       'A-MEXP-2276', 'GPL16770', 'A-MEXP-2140'])]
    .groupby('platform')  # was misspelled `groubpy`, which raises AttributeError
    .status.value_counts()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
platform,Unnamed: 1_level_1,Unnamed: 2_level_1
GPL17077,error,67
GPL17077,done,12
GPL6480,error,403
GPL6480,done,148
GPL6848,error,28
GPL6848,done,22


In [102]:
# Overall task status counts across the same hand-picked platform ids (not split by platform).
preproc.loc[
    preproc['platform'].isin([
        'GPL6848', 'GPL6480', 'A-AGIL-28', 'GPL17077', 'GPL20163',
        'A-MEXP-2276', 'GPL16770', 'A-MEXP-2140',
    ]),
    'status',
].value_counts()

error    498
done     182
dtype: int64

In [31]:
# Parsed `meta` JSON (one dict per task) of every finished Agilent preproc task.
# Each dict carries its own 'samples' list, which is flattened in a later cell.
converted_samples_agilent = [
    json.loads(task_meta)
    for task_meta in preproc.query(
        "type == 'preproc' and status == 'done' and manufacturer == 'agilent'"
    )['meta']
]

In [36]:
# Inspect the first parsed Agilent task metadata dict, including its full 'samples' list.
converted_samples_agilent[:1]

[{'accession': 'GSE26863',
  'platform': 'GPL9128',
  'samples': ['GSM660764',
   'GSM660764',
   'GSM660765',
   'GSM660765',
   'GSM660766',
   'GSM660766',
   'GSM660767',
   'GSM660767',
   'GSM660768',
   'GSM660768',
   'GSM660769',
   'GSM660769',
   'GSM660770',
   'GSM660770',
   'GSM660771',
   'GSM660772',
   'GSM660772',
   'GSM660773',
   'GSM660773',
   'GSM660774',
   'GSM660774',
   'GSM660775',
   'GSM660775',
   'GSM660776',
   'GSM660776',
   'GSM660777',
   'GSM660778',
   'GSM660777',
   'GSM660778',
   'GSM660779',
   'GSM660779',
   'GSM660780',
   'GSM660780',
   'GSM660781',
   'GSM660781',
   'GSM660782',
   'GSM660782',
   'GSM660783',
   'GSM660783',
   'GSM660784',
   'GSM660771',
   'GSM660784',
   'GSM660785',
   'GSM660785',
   'GSM660786',
   'GSM660786',
   'GSM660787',
   'GSM660787',
   'GSM660788',
   'GSM660788',
   'GSM660789',
   'GSM660789',
   'GSM660790',
   'GSM660790',
   'GSM660791',
   'GSM660791',
   'GSM660792',
   'GSM660792',
   'GSM66

In [38]:
# Flatten the per-task 'samples' lists of the finished Agilent tasks into one list and export it.
# NOTE(review): these lists can repeat an accession (the first task's list shows each GSM twice)
# and nothing here de-duplicates them — confirm write_annotation tolerates repeats.
# NOTE(review): exported with annotated=0, whereas the fRMA and Illumina exports use
# annotated=1 — confirm this difference is intended.
_t = [
    sample_accession
    for series_meta in converted_samples_agilent
    for sample_accession in series_meta['samples']
]
export_preprocessed(api, _t, annotated=0)

In [39]:
# Number of exported Agilent entries; repeated accessions, if any, are counted each time.
len(_t)

24209