In [39]:
import pandas as pd
import pymonad
# from pymonad.List import List
from functools import reduce
import fastnumbers

def chain_calls(functions):
    def func(args):
        return reduce((lambda args, func: func(args)), functions, args)
    return func
    

def values(ls):
    return map(lambda l: map(lambda s: s.strip('"').replace(',', '.'), l.split('\t')), ls)

def values_lists(ls):
    return list(map(list, values(ls)))

def is_float(s):
    return fastnumbers.isfloat(s)

def is_not_float(s):
    return not fastnumbers.isfloat(s)
    

def line_is_header(l):
    return all(map(is_not_float, l))

def line_is_values(start_col, l):
    return all(map(is_float, l[start_col:]))# and all(is_not_float * l[:start_col])


def n_lines_is_header(ls, nlines):
    return all(map(line_is_header, ls[:nlines]))


def rest_lines_is_values(ls, start_row, start_col):
    return all(map(lambda l: line_is_values(start_col, l), ls[start_row:]))


def col_count_match_samples(table, count, start_row, start_col, multiplyer):
    def line_match_samples(l):
        return len(l[start_col:]) == count * multiplyer
    return all(map(line_match_samples, table[start_row:]))


def conds(table, count, header_rows, header_cols, multiplyer):
    return [
                n_lines_is_header(table, 
                                  nlines=header_rows),
                rest_lines_is_values(table, 
                                     start_row=header_rows, 
                                     start_col=header_cols),
                col_count_match_samples(table, 
                                        count, 
                                        start_row=header_rows-1, 
                                        start_col=header_cols, 
                                        multiplyer=multiplyer),
            ]

def gen_model(samples_count_func, header_rows, header_cols, multiplyer):
    def model(accession, table):
        count = samples_count_func(accession)
        return all(conds(table, count, header_rows, header_cols, multiplyer))
#         return all([
#                 n_lines_is_header(table, 
#                                   nlines=header_rows),
#                 rest_lines_is_values(table, 
#                                      start_row=header_rows, 
#                                      start_col=header_cols),
#                 col_count_match_samples(table, 
#                                         count, 
#                                         start_row=header_rows-1, 
#                                         start_col=header_cols, 
#                                         multiplyer=multiplyer),
#             ])
    return ((header_rows, header_cols, multiplyer), model)


def gen_models(samples_count_func):
    return [gen_model(samples_count_func=samples_count_func,
                      header_rows=header_rows, 
                      header_cols=header_cols, 
                      multiplyer=multiplyer)
            for multiplyer in (1, 2)
            for header_rows in (1, 2)
            for header_cols in (1, 2)]

In [None]:
import pymongo
db = pymongo.MongoClient().scraper_meta

pipeline = [
    {'$unwind': '$series'},
    {'$group': {'_id': '$series', 'count': {'$sum': 1}}}
]
samples_count = dict((s['_id'], s['count']) for s in db.samples.aggregate(pipeline))

In [None]:
list(samples_count.items())[:10]

In [None]:
import pickle
with open('../data/preproc/intermediate/series_samples_count.dict.pickle', 'wb') as f:
    pickle.dump(samples_count, f)

In [3]:
import pickle
with open('../data/preproc/intermediate/series_samples_count.dict.pickle', 'rb') as f:
    samples_count = pickle.load(f)

In [4]:
series_suppls = pd.read_pickle('../data/preproc/intermediate/illumina_platforms_suppls.v1.pickle')

In [None]:
series_suppls.head()

In [5]:
models = gen_models(samples_count.get)

In [6]:
from os.path import exists, join
import requests as r 
# import requests_ftp

# requests_ftp.monkeypatch_session()
download_dir = '../data/preproc/cache/'
import urllib 
import urllib.request


url_tpl = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}nnn/{accession}/suppl/{file}'
def download_suppl(accession, file):
    url = url_tpl.format(
        prefix=accession[:-3],
        accession=accession,
        file=file
    )
    print(url)
    path = join(download_dir, file)
    if not exists(path):     
        with urllib.request.urlopen(url) as response, open(path, 'wb') as out_file:
            data = response.read()
            out_file.write(data)
#         res = r.get(url, stream=True)
#         if res.status_code == 200:
#             with open(path, 'wb') as f:
#                 r.raw.decode_content = True
#                 shutil.copyfileobj(r.raw, f)    
                
    return path

    
    

In [None]:
series_suppls.set_index('accession').loc['GSE28991'].name

In [None]:
accession = 'GSE28991'
download_suppl(accession, series_suppls.set_index('accession').loc[accession]['name'])

In [28]:
import gzip

def read_table_gz(suppl_file, header=None):
    res = []
    with gzip.open(suppl_file, 'rt') as f:
        n = 0
        for line in f:
            if header is not None and n > header:
                break
            n += 1
            line = line.rstrip('\n').rstrip('\r').rstrip('\t')#.decode('utf-8', errors='replace')
#             print("{}".format(line).replace("\t", '\\t'))
            if not line or \
               line.startswith("#") or  \
               line.startswith('"# Values that should be '):
                continue
#             print(line)
            res.append(line)
    return res

def decide_model(models, accession, suppl_file):
    header = read_table_gz(suppl_file, header=10)
    table = values_lists(header)
    return [params for params, model_func in models if model_func(accession, table)]

In [46]:
accession = 'GSE37721'
suppl_name = 'GSE37721_non-normalized.txt.gz'
suppl_file = download_suppl(accession, suppl_name)
des = decide_model(models, accession, suppl_file)
print(des)
table = values_lists(read_table_gz(suppl_file, header=10))
d = pd.DataFrame.from_records(table[1:], columns=table[0])
print(d.columns.shape)
print(samples_count.get(accession))

ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE37nnn/GSE37721/suppl/GSE37721_non-normalized.txt.gz
[]
(81,)
37


In [45]:
d.columns

Index(['PROBE_ID', 'SYMBOL', 'UMARY-604.AVG_Signal',
       'UMARY-604.Detection Pval', 'UMARY-813.AVG_Signal',
       'UMARY-813.Detection Pval', 'UMARY-819.AVG_Signal',
       'UMARY-819.Detection Pval', 'UMARY-1078.AVG_Signal',
       'UMARY-1078.Detection Pval', 
       ...
       'UMARY-1867.Detection Pval', 'UMARY-5024.AVG_Signal',
       'UMARY-5024.Detection Pval', 'UMARY-1170.AVG_Signal',
       'UMARY-1170.Detection Pval', 'SEARCH_KEY', 'ILMN_GENE', 'CHROMOSOME',
       'DEFINITION', 'SYNONYMS'],
      dtype='object', length=207)

In [41]:
conds(table, samples_count.get(accession), 1, 1, 2)

[True, True, True]

In [47]:
# pd.DataFrame.from_records(list(values((read_table_header_gz('../data/preproc/cache/GSE28991_non-normalized.txt.gz')))))
decide_model(models, 'GSE28991', '../data/preproc/cache/GSE28991_non-normalized.txt.gz')

[(1, 1, 2)]

In [49]:
pd.DataFrame.from_records(list(values((read_table_gz('../data/preproc/cache/GSE28991_non-normalized.txt.gz', header=10))))).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,ID_REF,Katja_1314_2010,Detection Pval,Katja_1315_2010,Detection Pval,Katja_630_2010,Detection Pval,Katja_673_2010,Detection Pval,Katja_790_2010,...,Katja_675_2010,Detection Pval,Katja_676_2010,Detection Pval,Katja_632_2010,Detection Pval,Katja_669_2010,Detection Pval,Katja_679_2010,Detection Pval
1,ILMN_1343291,45778.45,0,34460.26,0,33048.93,0,25716.26,0,59090.69,...,46668.02,0,43985.84,0,24805.55,0,31199.6,0,37451.81,0
2,ILMN_1343295,11766.09,0,2993.681,0,3232.279,0,2976.462,0,11475.27,...,5085.427,0,3608.203,0,1917.939,0,3377.081,0,1508.271,0
3,ILMN_1651199,-21.16069,0.7831169,-11.84853,0.6571429,-19.92676,0.7571428,1.149169,0.4792208,-15.11637,...,-19.92412,0.8025974,-19.50379,0.8051948,-12.80684,0.7090909,-2.03866,0.5324675,-29.94516,0.9311689
4,ILMN_1651209,17.5803,0.2428571,7.18391,0.3766234,-7.562912,0.6168831,-0.6398016,0.5064935,57.2438,...,21.55628,0.1493506,-11.57463,0.6779221,-8.819341,0.6428571,14.0957,0.2376623,-2.991712,0.5389611


In [82]:
import numpy as np
np.a

In [198]:
def parse_table(params, suppl_file):
    table = values_lists(read_table_gz(suppl_file))
    header_rows, header_cols, multiplyer = params
    return pd.DataFrame.from_records(data=table[header_rows:], columns=table[:header_rows][-1])

def every_n(xs, n, start=0):
    return [x for i, x in enumerate(xs) if (i+start) % n == 0]
    
def preprocess_table(params, suppl_file):
    header_rows, header_cols, multiplyer = params
    
    data = parse_table(params, suppl_file)
    data = data.set_index(data.columns[0]).astype(float)
    data = data.drop(data.columns[:header_cols - 1], axis=1)
    num_columns = data.shape[1]
    column_names = data.columns
    data.columns = list(range(data.shape[1]))
    
    if multiplyer == 2:
        pvals_columns = list(range(1, len(column_names), multiplyer))
        data_columns = list(range(0, len(column_names), multiplyer))
        
        pvals = data[pvals_columns].copy()
        data = data[data_columns].copy()
        
        data.columns = every_n(column_names, 2)
        pvals.columns = every_n(column_names, 2)
    else:
        data.columns = column_names
        pvals = pd.DataFrame(data=np.zeros(shape=data.shape), columns=column_names, index=data.index)
        
    # cleaning bad pvals
    pvals_max = pvals.apply(max, axis=1)
    data = data.drop(pvals_max[pvals_max >= 1.0].index)
    
    # correcting negative values
    min_value = data.min().min()
    data = data.applymap(lambda x: x - min_value)
    return data, pvals

In [153]:
annot_table = pd.read_table('../data/preproc/data/illuminaH12_v4_probeids_genesymbol.txt').set_index('ProbeId')
annot_table.head()

Unnamed: 0_level_0,Symbol
ProbeId,Unnamed: 1_level_1
ILMN_1653618,ZZZ3
ILMN_2137536,ZZZ3
ILMN_1786396,ZZEF1
ILMN_1701875,ZYX
ILMN_2371169,ZYX


In [154]:
annot_table.Symbol.unique().shape

(20541,)

In [193]:
def annotate_table(data, annot_table):
    return (
        np.log2(data)
        .join(annot_table.rename(columns={annot_table.columns[0]: '__GeneSymbol'}))
        .groupby('__GeneSymbol')
        .mean()
        .pow(2)
        .reset_index()
        .rename(columns={'__GeneSymbol':'GeneSymbol'})
        .set_index('GeneSymbol')
    )

In [201]:

params = decide_model(models, 'GSE28991', '../data/preproc/cache/GSE28991_non-normalized.txt.gz')[0]
data, pvals = preprocess_table((1, 1, 2), '../data/preproc/cache/GSE28991_non-normalized.txt.gz')
data.head()

Unnamed: 0_level_0,Katja_1314_2010,Katja_1315_2010,Katja_630_2010,Katja_673_2010,Katja_790_2010,Katja_791_2010,Katja_792_2010,Katja_793_2010,Katja_794_2010,Katja_795_2010,...,Katja_809_2010,Katja_810_2010,Katja_674_2010,Katja_642_2010,Katja_631_2010,Katja_675_2010,Katja_676_2010,Katja_632_2010,Katja_669_2010,Katja_679_2010
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ILMN_1343291,45840.24205,34522.05205,33110.72205,25778.05205,59152.48205,54366.93205,49915.79205,47877.39205,57385.46205,50165.15205,...,47115.26205,47878.90205,42075.50205,22402.64205,19438.57205,46729.81205,44047.63205,24867.34205,31261.39205,37513.60205
ILMN_1343295,11827.88205,3055.47305,3294.07105,3038.25405,11537.06205,15821.21205,5125.25705,10812.54205,17210.05205,6827.98705,...,9953.49805,5186.01505,2305.92705,2436.19205,1603.13005,5147.21905,3669.99505,1979.73105,3438.87305,1570.06305
ILMN_1651199,40.63136,49.94352,41.86529,62.941219,46.67568,61.859584,76.80503,58.376164,53.187126,52.266866,...,59.751181,40.67634,52.192966,44.75837,32.40766,41.86793,42.28826,48.98521,59.75339,31.84689
ILMN_1651209,79.37235,68.97596,54.229138,61.152248,119.03585,103.76603,73.0645,136.08786,108.82511,100.18388,...,97.32311,102.76038,78.72481,76.67247,49.4192,83.34833,50.21742,52.972709,75.88775,58.800338
ILMN_1651210,104.04269,69.30436,66.070309,63.137646,65.325454,60.538012,82.51323,83.18987,94.08498,78.13103,...,65.10488,60.317399,67.39706,73.14605,80.0014,62.24207,57.01102,66.606781,58.635706,64.787452


In [187]:
?pd.DataFrame.rename

In [200]:
annotate_table(data, annot_table)

Unnamed: 0_level_0,Katja_1314_2010,Detection Pval,Katja_1315_2010,Detection Pval,Katja_630_2010,Detection Pval,Katja_673_2010,Detection Pval,Katja_790_2010,Detection Pval,...,Katja_675_2010,Detection Pval,Katja_676_2010,Detection Pval,Katja_632_2010,Detection Pval,Katja_669_2010,Detection Pval,Katja_679_2010,Detection Pval
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,48.384756,46.292653,51.353163,46.260470,51.743354,46.263429,49.099859,46.279990,52.509292,46.253611,...,48.267351,46.287070,48.739002,46.291022,47.481502,46.301216,48.501463,46.285471,49.686661,46.273816
A1BG-AS1,43.572655,46.359892,44.545471,46.348055,46.293352,46.322997,43.915779,46.360347,46.123733,46.326188,...,43.880185,46.358982,46.553270,46.317071,44.844990,46.347827,46.241732,46.323909,46.426234,46.320718
A1CF,44.303880,46.347124,43.414418,46.359964,44.931877,46.333745,45.619977,46.334303,46.048612,46.323337,...,44.807538,46.342176,47.274852,46.306052,47.068387,46.308742,46.481887,46.318398,43.902005,46.360485
A2M,56.854246,46.239898,41.086762,46.383094,42.385200,46.371723,43.428070,46.365126,45.496830,46.336896,...,45.915273,46.328239,42.891712,46.374453,42.891099,46.376045,44.062709,46.362168,43.836214,46.363305
A2ML1,32.743384,46.413542,38.306815,46.400141,37.323812,46.405366,38.735090,46.405593,38.507299,46.403321,...,39.323784,46.402867,37.519045,46.409455,40.403389,46.398777,40.034158,46.401504,38.619984,46.410817
A3GALT2,45.785906,46.327521,48.739005,46.289226,49.389254,46.280454,44.632804,46.348275,47.139259,46.309661,...,47.167323,46.304073,46.230047,46.322751,47.891692,46.294718,46.403117,46.322529,47.793850,46.296753
A4GALT,40.674766,46.385822,38.678377,46.397868,40.710597,46.384004,39.375749,46.403094,39.444725,46.396505,...,43.526144,46.365808,42.768053,46.376500,43.090761,46.373770,40.461680,46.398323,42.106942,46.387186
A4GNT,50.764491,46.265268,48.646343,46.285363,52.310242,46.250643,49.659578,46.268238,49.472895,46.275317,...,45.394296,46.337579,44.545378,46.352836,40.309444,46.398777,47.396958,46.304529,49.396529,46.269151
AAAS,70.046545,46.238526,50.945027,46.262527,48.982714,46.285591,44.191893,46.356251,59.768853,46.238754,...,56.106426,46.240813,52.692086,46.245843,42.214667,46.381957,53.001614,46.244014,42.588179,46.380593
AACS,55.277770,46.242185,49.083151,46.280112,47.895170,46.299511,43.692631,46.362850,49.257438,46.277829,...,49.596362,46.266182,47.743663,46.296317,49.926800,46.263440,49.758725,46.265268,48.841506,46.275774


In [175]:
annot_table.query('Symbol == "EEF1A1"').index.tolist()

['ILMN_3251737', 'ILMN_2038774', 'ILMN_1810810', 'ILMN_1343291']

In [172]:
annot_table.loc['ILMN_1343291']

Symbol    EEF1A1
Name: ILMN_1343291, dtype: object

In [183]:
data.loc[annot_table.query('Symbol == "EEF1A1"').index.tolist()]

Unnamed: 0_level_0,Katja_1314_2010,Katja_1315_2010,Katja_630_2010,Katja_673_2010,Katja_790_2010,Katja_791_2010,Katja_792_2010,Katja_793_2010,Katja_794_2010,Katja_795_2010,...,Katja_809_2010,Katja_810_2010,Katja_674_2010,Katja_642_2010,Katja_631_2010,Katja_675_2010,Katja_676_2010,Katja_632_2010,Katja_669_2010,Katja_679_2010
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ILMN_3251737,440.11655,286.62835,219.59795,168.27745,498.73985,699.00105,495.06015,327.48065,378.04815,621.57255,...,432.13375,605.75455,350.48745,112.22683,147.97123,315.35675,344.63485,150.23719,204.52725,214.47715
ILMN_2038774,38517.15205,23845.71205,16648.09205,14608.15205,49948.90205,59326.99205,35754.58205,37652.58205,59198.30205,43455.42205,...,34811.26205,38497.18205,23609.48205,8078.01605,7069.07705,21591.99205,23395.51205,9645.93705,11798.90205,15810.08205
ILMN_1810810,43277.72205,32662.68205,24555.56205,19183.42205,53025.18205,61286.43205,45643.02205,48634.57205,59215.43205,46809.68205,...,41717.06205,48594.35205,31997.42205,11531.38205,11548.09205,32453.66205,32983.36205,14301.80205,17905.64205,23541.47205
ILMN_1343291,45840.24205,34522.05205,33110.72205,25778.05205,59152.48205,54366.93205,49915.79205,47877.39205,57385.46205,50165.15205,...,47115.26205,47878.90205,42075.50205,22402.64205,19438.57205,46729.81205,44047.63205,24867.34205,31261.39205,37513.60205


In [71]:
q = pd.DataFrame(columns=list('abc'))
q.drop(['a', 'b'], axis=1)

Unnamed: 0,c


In [127]:
data.min().min()

-111.4221

In [78]:
every_n([1, 2, 3, 4, 5, 6], 2, 2)

[1, 3, 5]

In [111]:
list(range(1, 10, 2))

[1, 3, 5, 7, 9]