In [2]:
import os
import shutil

import pandas as pd
import pymongo

In [4]:
# Handle to the 'scraper_meta' database; MongoClient() is called with no
# arguments, so pymongo's default connection settings are used.
db = pymongo.MongoClient()['scraper_meta']

In [67]:
# Accessions of every platform document whose manufacturer is 'Illumina Inc.'
# and whose organism is 'Homo sapiens'.
illumina_platforms = [
    platform['accession']
    for platform in db.platforms.find(
        {'manufacturer': 'Illumina Inc.', 'organism': 'Homo sapiens'},
        {'_id': 0, 'accession': 1},  # projection: return only the accession
    )
]
print(len(illumina_platforms))
illumina_platforms[:3]

54


['GPL2507', 'GPL2700', 'GPL3896']

In [68]:
# Series that reference at least one of the selected platforms; the projection
# keeps only the accession and the list of supplementary files.
res = list(
    db.series.find(
        {'platforms': {'$in': illumina_platforms}},
        {'_id': 0, 'accession': 1, 'supplementary_files': 1},
    )
)

In [69]:
# Preview the first ten series documents returned by the query.
res[:10]

[{'accession': 'GSE21715',
  'supplementary_files': [{'name': 'GSE21715_non-normalized.txt.gz',
    'type': 'TXT'}]},
 {'accession': 'GSE33221',
  'supplementary_files': [{'name': 'GSE33221_RAW.tar',
    'type': 'TAR (of BEDGRAPH)'}]},
 {'accession': 'GSE33220',
  'supplementary_files': [{'name': 'GSE33220_RAW.tar', 'type': 'TAR'},
   {'name': 'GSE33220_sample_gene_report_gpl10399.txt.gz', 'type': 'TXT'},
   {'name': 'GSE33220_sample_probe_report_gpl10558.txt.gz', 'type': 'TXT'},
   {'name': 'GSE33220_sample_probe_report_gpl6947.txt.gz', 'type': 'TXT'}]},
 {'accession': 'GSE54267',
  'supplementary_files': [{'name': 'GSE54267_RAW.tar', 'type': 'TAR'},
   {'name': 'GSE54267_non_normalized_set4.txt.gz', 'type': 'TXT'}]},
 {'accession': 'GSE54268',
  'supplementary_files': [{'name': 'GSE54268_RAW.tar', 'type': 'TAR'},
   {'name': 'GSE54268_non_normalized_set5.txt.gz', 'type': 'TXT'}]},
 {'accession': 'GSE54269',
  'supplementary_files': [{'name': 'GSE54269_RAW.tar', 'type': 'TAR'}]},
 {'a

In [70]:
# Flatten to one (series accession, file name, file type) tuple per
# supplementary file.
filenames = [
    (series['accession'], suppl['name'], suppl['type'])
    for series in res
    for suppl in series['supplementary_files']
]

In [71]:
# Tabular view of the flattened tuples: one row per supplementary file.
filenames_df = pd.DataFrame.from_records(
    filenames,
    columns=['accession', 'name', 'type'],
)
filenames_df.head()

Unnamed: 0,accession,name,type
0,GSE21715,GSE21715_non-normalized.txt.gz,TXT
1,GSE33221,GSE33221_RAW.tar,TAR (of BEDGRAPH)
2,GSE33220,GSE33220_RAW.tar,TAR
3,GSE33220,GSE33220_sample_gene_report_gpl10399.txt.gz,TXT
4,GSE33220,GSE33220_sample_probe_report_gpl10558.txt.gz,TXT


In [12]:
def scrap_type(name):
    """Return the part of ``name`` that follows its first underscore.

    For ``'GSE21715_non-normalized.txt.gz'`` this yields
    ``'non-normalized.txt.gz'``. A name without any underscore yields ``''``.
    """
    _head, _sep, remainder = name.partition('_')
    return remainder

In [72]:
# Add a 'suffix' column holding the part of each file name after the first
# underscore (see scrap_type).
filenames_df['suffix'] = filenames_df['name'].map(scrap_type)
filenames_df.head()

Unnamed: 0,accession,name,type,suffix
0,GSE21715,GSE21715_non-normalized.txt.gz,TXT,non-normalized.txt.gz
1,GSE33221,GSE33221_RAW.tar,TAR (of BEDGRAPH),RAW.tar
2,GSE33220,GSE33220_RAW.tar,TAR,RAW.tar
3,GSE33220,GSE33220_sample_gene_report_gpl10399.txt.gz,TXT,sample_gene_report_gpl10399.txt.gz
4,GSE33220,GSE33220_sample_probe_report_gpl10558.txt.gz,TXT,sample_probe_report_gpl10558.txt.gz


In [73]:
# How often each supplementary file type occurs.
filenames_df['type'].value_counts().to_frame()

Unnamed: 0,0
TXT,2346
TAR,1893
TAR (of TXT),176
SDF,50
TAR (of IDAT),48
TAR (of CEL),21
SRA Study,17
XLS,17
TAR (of BED),16
CSV,16


In [74]:
# How often each file-name suffix occurs.
filenames_df['suffix'].value_counts().to_frame()

Unnamed: 0,0
RAW.tar,2256
non-normalized.txt.gz,1063
non_normalized.txt.gz,221
non-normalized_data.txt.gz,208
normalized.txt.gz,56
Non-normalized_data.txt.gz,37
,29
raw_data.txt.gz,22
nonorm_nobkgd.txt.gz,10
raw.txt.gz,9


In [76]:
# Rows whose suffix, after mapping '-' to '_' and lower-casing, is one of the
# two "non-normalized" spellings.
filenames_df.loc[
    lambda frame: frame['suffix'].str.replace('-', '_').str.lower().isin(
        ['non_normalized.txt.gz', 'non_normalized_data.txt.gz']
    )
]

Unnamed: 0,accession,name,type,suffix
0,GSE21715,GSE21715_non-normalized.txt.gz,TXT,non-normalized.txt.gz
11,GSE54293,GSE54293_Non_normalized_data.txt.gz,TXT,Non_normalized_data.txt.gz
14,GSE54326,GSE54326_non_normalized.txt.gz,TXT,non_normalized.txt.gz
16,GSE54350,GSE54350_non-normalized.txt.gz,TXT,non-normalized.txt.gz
17,GSE54400,GSE54400_Non-normalized_data.txt.gz,TXT,Non-normalized_data.txt.gz
20,GSE54465,GSE54465_non-normalized.txt.gz,TXT,non-normalized.txt.gz
22,GSE54480,GSE54480_non-normalized.txt.gz,TXT,non-normalized.txt.gz
24,GSE54493,GSE54493_non-normalized.txt.gz,TXT,non-normalized.txt.gz
28,GSE54608,GSE54608_Non-normalized_data.txt.gz,TXT,Non-normalized_data.txt.gz
31,GSE54617,GSE54617_non_normalized.txt.gz,TXT,non_normalized.txt.gz


In [53]:
# URL template for a series supplementary file; filled in with str.format
# using the fields {prefix}, {accession} and {file}.
url_tpl = (
    'ftp://ftp.ncbi.nlm.nih.gov/geo/series/'
    '{prefix}nnn/{accession}/suppl/{file}'
)

In [29]:
# Scratch check: dropping the last three characters of an accession gives the
# value substituted for {prefix} in url_tpl.
'GSE55873'[:-3]

'GSE55'

In [33]:
# Create the download cache directory (no error if it already exists).
! mkdir -p ../data/preproc/cache/

In [51]:
from os.path import exists, join
import requests as r 
# import requests_ftp

# requests_ftp.monkeypatch_session()
# Directory that download_suppl writes downloaded files into.
download_dir = '../data/preproc/cache/'
import urllib 
import urllib.request


    
def download_suppl(accession, file, *, cache_dir=None, url_template=None):
    """Fetch one supplementary file into the local cache and return its path.

    Parameters
    ----------
    accession : str
        Series accession, e.g. ``'GSE55873'``.
    file : str
        Supplementary file name; used as the last URL component and as the
        file name inside the cache directory.
    cache_dir : str, optional
        Directory to store the download in. Defaults to the notebook-level
        ``download_dir``.
    url_template : str, optional
        ``str.format`` template with ``{prefix}``, ``{accession}`` and
        ``{file}`` fields. Defaults to the notebook-level ``url_tpl``.

    Returns
    -------
    str
        Path of the cached file. If that path already exists, nothing is
        downloaded.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when the URL cannot be
        opened. Other I/O errors from writing the file propagate unchanged.
    """
    if cache_dir is None:
        cache_dir = download_dir
    if url_template is None:
        url_template = url_tpl

    # The directory bucket is the accession without its last three characters,
    # but never shorter than the three-letter type prefix: 'GSE55873' ->
    # 'GSE55', while 'GSE12' -> 'GSE' (a plain [:-3] would give 'GS').
    prefix = accession[:max(3, len(accession) - 3)]
    url = url_template.format(prefix=prefix, accession=accession, file=file)

    path = join(cache_dir, file)
    if exists(path):
        return path

    # Stream into a sibling '.part' file and rename only on success, so an
    # interrupted transfer can never be mistaken for a complete cached file.
    part_path = path + '.part'
    try:
        with urllib.request.urlopen(url) as response, open(part_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(part_path, path)
    except BaseException:
        # Also covers KeyboardInterrupt from an interrupted notebook cell.
        if exists(part_path):
            os.remove(part_path)
        raise

    return path

    

In [77]:
# Independent copy of the rows whose suffix, after mapping '-' to '_' and
# lower-casing, is one of the two "non-normalized" spellings.
suppls = filenames_df.loc[
    lambda frame: frame['suffix'].str.replace('-', '_').str.lower().isin(
        ['non_normalized.txt.gz', 'non_normalized_data.txt.gz']
    )
].copy()

In [64]:
# Try the download helper on one series and look at the parsed table.
accession = 'GSE24896'
# First supplementary file name recorded for this accession.
name = suppls.loc[suppls['accession'] == accession, 'name'].iloc[0]
print(name)

p = download_suppl(accession, name)
print(p)
pd.read_table(p, quotechar="#")

GSE24896_non-normalized.txt.gz
../data/preproc/cache/GSE24896_non-normalized.txt.gz


Unnamed: 0,The identifiers in the ID_REF column must match the identifiers in the ID column of the referenced platform (GPLxxxx). The Matrix table should include non-normalized signal count data and detectip p-value,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,"""# Values that should be disregarded may eithe...",,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,ID_REF,ID_REF,siCont_1,Detection Pval,siCont_2,Detection Pval,siCont_3,Detection Pval,siBAP1_1,Detection Pval,siBAP1_2,Detection Pval,siBAP1_3,Detection Pval
3,ILMN_6450255,ILMN_1762337,160.2,0.2013,198.6,0.03117,185.5,0.05974,146.7,0.46104,173.8,0.04156,163.7,0.25844
4,ILMN_2570615,ILMN_2055271,187.7,0.02727,178.4,0.1,168,0.18961,152.6,0.38052,162.9,0.08442,192,0.04935
5,ILMN_6370619,ILMN_1736007,133.7,0.48312,133.5,0.58312,140.4,0.48571,127.7,0.62727,140.1,0.3039,134.7,0.56623
6,ILMN_2600039,ILMN_2383229,138.4,0.43766,180.2,0.08182,166.7,0.20649,166.1,0.23506,144,0.26364,162,0.27143
7,ILMN_2650615,ILMN_1806310,127.8,0.55195,133,0.58312,168.1,0.18831,145.5,0.47143,112.9,0.64156,146.1,0.46883
8,ILMN_5340672,ILMN_1779670,90.2,0.80649,89.8,0.85584,92,0.77662,101.8,0.77532,103.3,0.7026,102.9,0.73506
9,ILMN_2000519,ILMN_1653355,122.2,0.62078,163.7,0.22597,181.7,0.08312,129.4,0.61818,172.3,0.04935,184.9,0.08312


In [79]:
# Persist the selected supplementary-file table as a pickle.
suppls.to_pickle('../data/preproc/intermediate/illumina_platforms_suppls.v1.pickle')

In [80]:
# Copy the pickle to a remote host with scp.
# NOTE(review): user name and host are hard-coded here; consider moving them to configuration.
!scp ../data/preproc/intermediate/illumina_platforms_suppls.v1.pickle npryanichnikov@ui2.computing.kiae.ru:ls2/preproc/tmp

illumina_platforms_suppls.v1.pickle             0%    0     0.0KB/s   --:-- ETAillumina_platforms_suppls.v1.pickle           100%  126KB 126.1KB/s   00:00    


In [83]:
# Leftover interactive help lookup (IPython `?` syntax); nothing else in this notebook uses its result.
?bytes.decode