In [5]:
import pandas as pd
import pymonad
from pymonad.List import List

In [6]:
import sys
sys.path.append('..')

In [7]:
from lib import utils

In [2]:
! ls -1 ../data/preproc/intermediate/illfilestat/ | wc -l

    1527


In [15]:
def read_dataset(accession):
    with open('../data/preproc/intermediate/illfilestat/'+accession) as f:
        res = []
        for line in f:
            line = line.rstrip('\n').rstrip('\r').rstrip('\t')
            if not line or line.startswith("#") or line.startswith('"# Values that should be'):
                continue
            res.append(line.split('\t'))

        return res

In [17]:
ls = read_dataset('GSE22433')
ls[:3]

[['ID_REF',
  'SAMPLE 13',
  'Detection Pval',
  'SAMPLE 14',
  'Detection Pval',
  'SAMPLE 15',
  'Detection Pval',
  'SAMPLE 16',
  'Detection Pval',
  'SAMPLE 17',
  'Detection Pval',
  'SAMPLE 18',
  'Detection Pval'],
 ['ILMN_1343291',
  '32545.97',
  '0',
  '32995.24',
  '0',
  '34307.06',
  '0',
  '29865.8',
  '0',
  '29052.86',
  '0',
  '29012.86',
  '0'],
 ['ILMN_1343295',
  '12109.73',
  '0',
  '10680.7',
  '0',
  '11345.47',
  '0',
  '17510.41',
  '0',
  '16444.12',
  '0',
  '15190.56',
  '0']]

In [20]:
import pandas as pd
import pymonad
# from pymonad.List import List
from functools import reduce
import fastnumbers

def chain_calls(functions):
    def func(args):
        return reduce((lambda args, func: func(args)), functions, args)
    return func
    

def values(ls):
    return map(lambda l: l.split('\t'), ls)


def is_float(s):
    return fastnumbers.isfloat(s)

def is_not_float(s):
    return not fastnumbers.isfloat(s)
    

def line_is_header(l):
    return all(map(is_not_float, l))

def line_is_values(start_col, l):
    return all(map(is_float, l[start_col:]))# and all(is_not_float * l[:start_col])


def n_lines_is_header(ls, nlines):
    return all(map(line_is_header, ls[:nlines]))


def rest_lines_is_values(ls, start_row, start_col):
    return all(map(lambda l: line_is_values(start_col, l), ls[start_row:]))


def col_count_match_samples(table, count, start_row, start_col, multiplyer):
    def line_match_samples(l):
        return len(l[start_col:]) == count * multiplyer
    return all(map(line_match_samples, table[start_row:]))


def gen_model(samples_count_func, header_rows, header_cols, multiplyer):
    def model(accession, table):
        count = samples_count_func(accession)
        return all([
                n_lines_is_header(table, 
                                  nlines=header_rows),
                rest_lines_is_values(table, 
                                     start_row=header_rows, 
                                     start_col=header_cols),
                col_count_match_samples(table, 
                                        count, 
                                        start_row=header_rows-1, 
                                        start_col=header_cols, 
                                        multiplyer=multiplyer),
            ])
    return ((header_rows, header_cols, multiplyer), model)


def gen_models(samples_count_func):
    return [gen_model(samples_count_func=samples_count_func,
                      header_rows=header_rows, 
                      header_cols=header_cols, 
                      multiplyer=multiplyer)
            for multiplyer in (1, 2)
            for header_rows in (1, 2)
            for header_cols in (1, 2)]


In [18]:
import pickle
with open('../data/preproc/intermediate/series_samples_count.dict.pickle', 'rb') as f:
    samples_count = pickle.load(f)

In [21]:
models = gen_models(samples_count.get)

In [36]:
import pymongo
db = pymongo.MongoClient().scraper_meta

In [22]:
accessions = !ls -1 ../data/preproc/intermediate/illfilestat/
tables = list(map(lambda acc: (acc, read_dataset(acc)), accessions))

In [23]:
%store samples_count

Stored 'samples_count' (dict)


In [25]:
%store tables

Stored 'tables' (list)


In [28]:
res = [[model_func(accession, table) for params, model_func in models]
  for accession, table in tables
]

In [29]:
params = [params for params, model_func in models]
accessions_index = [accession for accession, table in tables]

In [30]:
res_df = pd.DataFrame.from_records(res, columns=params, index=accessions_index).applymap(int)
print(res_df.shape)

(1527, 8)


In [32]:
%store res_df

Stored 'res_df' (DataFrame)


In [31]:
res_df.head()

Unnamed: 0,"(1, 1, 1)","(1, 2, 1)","(2, 1, 1)","(2, 2, 1)","(1, 1, 2)","(1, 2, 2)","(2, 1, 2)","(2, 2, 2)"
GSE15390,0,0,0,0,1,0,0,0
GSE15530,1,0,0,0,0,0,0,0
GSE15550,0,0,0,0,0,0,0,0
GSE15678,0,0,0,0,1,0,0,0
GSE16099,1,0,0,0,0,0,0,0


In [92]:
res_df[~res_df.apply(any, axis=1)].head()

Unnamed: 0,"(1, 1, 1)","(1, 2, 1)","(2, 1, 1)","(2, 2, 1)","(1, 1, 2)","(1, 2, 2)","(2, 1, 2)","(2, 2, 2)"
GSE15550,0,0,0,0,0,0,0,0
GSE16484,0,0,0,0,0,0,0,0
GSE16517,0,0,0,0,0,0,0,0
GSE17503,0,0,0,0,0,0,0,0
GSE17516,0,0,0,0,0,0,0,0


In [33]:
_t = res_df.T.sum()
# _t[_t == 0]
# res_df[_t.map(bool)]
res_df.assign(none=_t.map(lambda x: int(not x))).sum().to_frame()
# res_df.T.loc[_t[_t == 0]]
# res_df.sum().to_frame()

Unnamed: 0,0
"(1, 1, 1)",128
"(1, 2, 1)",18
"(2, 1, 1)",10
"(2, 2, 1)",1
"(1, 1, 2)",840
"(1, 2, 2)",51
"(2, 1, 2)",84
"(2, 2, 2)",7
none,388


In [100]:
res_df[res_df[(1, 1, 2)].map(bool)].head()

Unnamed: 0,"(1, 1, 1)","(1, 2, 1)","(2, 1, 1)","(2, 2, 1)","(1, 1, 2)","(1, 2, 2)","(2, 1, 2)","(2, 2, 2)"
GSE15390,0,0,0,0,1,0,0,0
GSE15678,0,0,0,0,1,0,0,0
GSE16186,0,0,0,0,1,0,0,0
GSE16421,0,0,0,0,1,0,0,0
GSE17579,0,0,0,0,1,0,0,0


In [34]:
res_df.T.sum().sum(), res_df.shape[0]

(1139, 1527)

In [58]:
series_GPL10558_GPL6947 = pd.DataFrame(list(db.series.find({'accession': {'$in': accessions}, 
                                                    'platforms': {'$in':['GPL10558', 'GPL6947']}}, {'_id':0, 'accession':1, 'platforms': 1})))

In [59]:
%store series_GPL10558_GPL6947

Stored 'series_GPL10558_GPL6947' (DataFrame)


In [60]:
series_GPL10558_GPL6947[series_GPL10558_GPL6947.platforms.map(len) == 2]

Unnamed: 0,accession,platforms
18,GSE55319,"[GPL6884, GPL10558]"
273,GSE25772,"[GPL6884, GPL10558]"
551,GSE38900,"[GPL6884, GPL10558]"
748,GSE32964,"[GPL6947, GPL6985]"
779,GSE28384,"[GPL6887, GPL6947]"
780,GSE28319,"[GPL6947, GPL9115]"
807,GSE22577,"[GPL6804, GPL6947]"
830,GSE43208,"[GPL6884, GPL6947]"
906,GSE38226,"[GPL6947, GPL8179]"


In [61]:
accessions_only_GPL10558_GPL6947 = series_GPL10558_GPL6947[series_GPL10558_GPL6947.platforms.map(len) == 1].accession.tolist()

In [45]:
res_df[res_df[(1, 1, 2)].map(bool) & res_df.index.isin(accessions_only_GPL10558)].head()

Unnamed: 0,"(1, 1, 1)","(1, 2, 1)","(2, 1, 1)","(2, 2, 1)","(1, 1, 2)","(1, 2, 2)","(2, 1, 2)","(2, 2, 2)"
GSE22427,0,0,0,0,1,0,0,0
GSE25154,0,0,0,0,1,0,0,0
GSE27124,0,0,0,0,1,0,0,0
GSE28655,0,0,0,0,1,0,0,0
GSE28656,0,0,0,0,1,0,0,0


In [46]:
series_suppls = pd.read_pickle('../data/preproc/intermediate/illumina_platforms_suppls.v1.pickle').set_index('accession')
series_suppls.head()

Unnamed: 0_level_0,name,type,suffix
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GSE21715,GSE21715_non-normalized.txt.gz,TXT,non-normalized.txt.gz
GSE54293,GSE54293_Non_normalized_data.txt.gz,TXT,Non_normalized_data.txt.gz
GSE54326,GSE54326_non_normalized.txt.gz,TXT,non_normalized.txt.gz
GSE54350,GSE54350_non-normalized.txt.gz,TXT,non-normalized.txt.gz
GSE54400,GSE54400_Non-normalized_data.txt.gz,TXT,Non-normalized_data.txt.gz


In [47]:
%store series_suppls

Stored 'series_suppls' (DataFrame)


In [63]:
series_only_GPL10558_GPL6947 = res_df[res_df.index.isin(accessions_only_GPL10558_GPL6947)].index
series_only_GPL10558_GPL6947[:5]

Index(['GSE15530', 'GSE17048', 'GSE17065', 'GSE17579', 'GSE17822'], dtype='object')

In [52]:
def task_for_series(accession, platform):
    return {
        'name': '{}_{}'.format(accession, platform),
        'type': 'preproc',
        'meta':{
            'accession': accession,
            'platform': platform,
            'samples_count': int(samples_count[accession]),
            'suppl_file': series_suppls.loc[accession]['name']
        }
    }

In [53]:
t = task_for_series('GSE22427', 'GPL10558')
t

{'meta': {'accession': 'GSE22427',
  'platform': 'GPL10558',
  'samples_count': 12,
  'suppl_file': 'GSE22427_non-normalized.txt.gz'},
 'name': 'GSE22427_GPL10558',
 'type': 'preproc'}

In [69]:
get_platform = lambda  accession: series_GPL10558_GPL6947[series_GPL10558_GPL6947.accession == accession].iloc[0].platforms[0]
# get_platform('GSE54293')
tasks = [task_for_series(series, get_platform(series)) for series in series_only_GPL10558_GPL6947]

In [70]:
utils.write_json(tasks, '../data/preproc/intermediate/ill.tasks.v4.json')

In [71]:
!scp ../data/preproc/intermediate/ill.tasks.v4.json npryanichnikov@ui2.computing.kiae.ru:ls2/preproc/tmp

ill.tasks.v4.json                               0%    0     0.0KB/s   --:-- ETAill.tasks.v4.json                             100%  160KB 160.4KB/s   00:00    


In [107]:
from shutil import copyfile

In [108]:
!mkdir ../data/preproc/intermediate/illfilestat_GPL10558

In [114]:
for index, accession in series_GPL10558.accession.iteritems():
    copyfile('../data/preproc/intermediate/illfilestat/{}'.format(accession), '../data/preproc/intermediate/illfilestat_GPL10558/{}'.format(accession))

In [72]:
res_df.query('model1 and model2')

Unnamed: 0,accession,model1,model2,model3


In [144]:
res_df.query('model2')

Unnamed: 0,accession,model1,model2,model3
5,GSE16170,False,True,True
43,GSE19274,False,True,True
112,GSE22792,False,True,True
120,GSE22955,False,True,False
126,GSE23135,False,True,True
127,GSE23136,False,True,True
128,GSE23137,False,True,True
129,GSE23138,False,True,True
130,GSE23139,False,True,True
131,GSE23289,False,True,True


In [61]:
table = values * read_clean('GSE16170')
print(first_line_is_header(table))
print(rest_lines_is_values(table, start_row=1, start_col=2))


True
True


In [53]:
table

[['ID_REF',
  'SYMBOL',
  'SAMPLE 1 (Average signal)',
  'SAMPLE 1 (p-Value)',
  'SAMPLE 2 (Average signal)',
  'SAMPLE 2 (p-Value)',
  'SAMPLE 3 (Average signal)',
  'SAMPLE 3 (p-Value)',
  'SAMPLE 4 (Average signal)',
  'SAMPLE 4 (p-Value)',
  'SAMPLE 5 (Average signal)',
  'SAMPLE 5 (p-Value)',
  'SAMPLE 6 (Average signal)',
  'SAMPLE 6 (p-Value)',
  '',
  ''],
 ['ILMN_1809034',
  '15E1.2',
  '426.9661',
  '0',
  '362.5882',
  '0',
  '474.5491',
  '0',
  '580.3371',
  '0',
  '512.84',
  '0',
  '449.0536',
  '0',
  '',
  ''],
 ['ILMN_1660305',
  "2'-PDE",
  '494.7791',
  '0',
  '302.9382',
  '0',
  '471.6671',
  '0',
  '461.2837',
  '0',
  '361.888',
  '0',
  '341.4054',
  '0',
  '',
  ''],
 ['ILMN_1762337',
  '7A5',
  '76.51974',
  '0.8328313',
  '74.14654',
  '0.876506',
  '92.10452',
  '0.2740964',
  '74.74339',
  '0.8870482',
  '83.25055',
  '0.6716868',
  '82.63262',
  '0.5798193',
  '',
  ''],
 ['ILMN_2055271',
  'A1BG',
  '117.1434',
  '0.04216867',
  '95.63122',
  '0.2048193'

In [38]:
line_is_values(1) * table[1:]

[False, False, False, False, False, False, False, False, False]

In [40]:
print(is_float * table[1][1:])
print(is_not_float * table[1][:1])

[True, True, True, True, False, False, False, False, False, False, False, False, False, False, False]
[True]
