## Сводные результаты аннотации по болезням, тканям, годам, статусу препроцессинга

In [1]:
import pandas as pd
import pymongo as pm

db = pm.MongoClient().scraper_meta

In [2]:
pd.read_csv('../data/preproc/res.series-disease.preprocessed.2.csv').head()

Unnamed: 0,series,doid,disease,samples_count
0,GSE430,DOID:6000,congestive heart failure,14
1,GSE473,DOID:2841,asthma,175
2,GSE474,DOID:9970,obesity,24
3,GSE475,DOID:3083,chronic obstructive pulmonary disease,7
4,GSE567,DOID:2228,thrombocytosis,2


In [3]:
series_disease = pd.read_pickle('../data/geo-annotation/series_disease.pickle')
# print(series_disease)
series_disease.head()

Unnamed: 0,series,doid,disease,samples_count,platforms
0,GSE3,DOID:4467,renal clear cell carcinoma,342,"[GPL9, GPL10]"
2,GSE16,DOID:1612,breast cancer,25,[GPL28]
3,GSE17,DOID:440,neuromuscular disease,2,[GPL4]
5,GSE31,DOID:161,keratosis,5,[GPL4]
9,GSE49,DOID:1612,breast cancer,10,[GPL179]


In [6]:
res_tissues = pd.read_pickle('../data/geo-annotation/series.tissue.all.res3.pickle').reset_index()
res_tissues.head()

Unnamed: 0,series,collapsed
0,GSE10,BTO:0001177
1,GSE10001,BTO:0000759
2,GSE10003,BTO:0000762
3,GSE10005,BTO:0000282
4,GSE10007,BTO:0001573


In [7]:
_t = db.series.find({
        'accession': {'$in': res_tissues.series.unique().tolist()}
    },{
        '_id': 0,
        'accession': 1,
#         'samples': 1,
        'platforms': 1
    })

series_tissue = (
    pd.DataFrame.from_records([(c['accession'], c['platforms']) for c in _t],
                             columns=['series', 'platforms'])
    .set_index('series')
    .join(res_tissues.set_index('series'))
    .reset_index()
    .rename(columns=dict(collapsed='tissue_id'))
#     .assign(samples=lambda d: d['samples'].map(len))
)
series_tissue.head()

Unnamed: 0,series,platforms,tissue_id
0,GSE10,[GPL4],BTO:0001177
1,GSE10001,[GPL6246],BTO:0000759
2,GSE10003,[GPL6104],BTO:0000762
3,GSE10005,[GPL6298],BTO:0000282
4,GSE10007,[GPL2005],BTO:0001573


In [9]:
samples_age = pd.read_pickle('../data/geo-annotation/series.age.res.1.py3.pickle')[['accession', 'age']].drop_duplicates()
samples_age.head()

Unnamed: 0,accession,age
0,GSM1000061,62
1,GSM1000062,62
2,GSM1000223,41
3,GSM1000224,48
4,GSM1000225,53


## Samples series

In [11]:
samples_series = (
    pd.DataFrame.from_records(
        [(s['accession'], series, s['platform'])
            for s in db.samples.find({}, {'_id': 0, 'accession': 1, 'series': 1, 'platform': 1})
            for series in s['series']],
        columns=['accession', 'series', 'platform']
    )
    .drop_duplicates()
)
samples_series.head()

Unnamed: 0,accession,series,platform
0,GSM1,GSE506,GPL4
1,GSM2,GSE506,GPL4
2,GSM3,GSE462,GPL5
3,GSM4,GSE462,GPL5
4,GSM5,GSE462,GPL5


In [12]:
samples_norms = pd.read_pickle('../data/geo-annotation/samples.norms.res2.pickle').drop_duplicates().merge(samples_series[['accession', 'platform']])
print(samples_norms.shape)
samples_norms.head()

(2344928, 4)


Unnamed: 0,accession,norm,series,platform
0,GSM305072,1,GSE12103,GPL6848
1,GSM305305,1,GSE12103,GPL6848
2,GSM305310,1,GSE12103,GPL6848
3,GSM1422841,1,GSE58949,GPL18883
4,GSM1422846,1,GSE58949,GPL18883


## Samples disease

In [13]:
samples_disease = pd.merge(samples_series, series_disease)[['accession', 'series', 'doid']].drop_duplicates()
samples_disease.head()

Unnamed: 0,accession,series,doid
0,GSM81,GSE3,DOID:4467
1,GSM82,GSE3,DOID:4467
2,GSM83,GSE3,DOID:4467
3,GSM84,GSE3,DOID:4467
4,GSM85,GSE3,DOID:4467


## Проверка что если семпл в разных сериях, то болезни проставлены одинаковые

In [14]:
samples_disease.shape, samples_disease.drop_duplicates().shape, samples_disease.accession.unique().shape


((422311, 3), (422311, 3), (367979,))

In [15]:
_t = (
    samples_disease
    .groupby('accession')
    .agg({'series': lambda s: set(s), 'doid': lambda s: set(s)})
)

In [16]:
samples_disease = (
    _t
    .assign(doid_len=_t.doid.map(len))
    .query('doid_len == 1')
    .assign(doid=lambda d: d['doid'].map(lambda x: list(x)[0]))
    [['doid']]
    .reset_index()
    .merge(samples_series[['accession', 'platform']].drop_duplicates())
)
print(samples_disease.shape)
samples_disease.head()

(357183, 3)


Unnamed: 0,accession,doid,platform
0,GSM100,DOID:4467,GPL9
1,GSM1000061,DOID:0050866,GPL5477
2,GSM1000062,DOID:0050866,GPL5477
3,GSM1000612,DOID:3070,GPL570
4,GSM1000613,DOID:3070,GPL570


## Samples tissue

In [17]:
samples_tissue = pd.merge(samples_series, series_tissue)[['accession', 'series', 'tissue_id']].drop_duplicates()
samples_tissue.head()

Unnamed: 0,accession,series,tissue_id
0,GSM50,GSE2,BTO:0000232
1,GSM51,GSE2,BTO:0000232
2,GSM52,GSE2,BTO:0000232
3,GSM53,GSE2,BTO:0000232
4,GSM54,GSE2,BTO:0000232


## Проверка что если семпл в разных сериях, то ткани проставлены одинаковые

In [18]:
samples_tissue.shape, samples_tissue.drop_duplicates().shape, samples_tissue.accession.unique().shape


((627508, 3), (627508, 3), (554244,))

In [19]:
_t = (
    samples_tissue
    .groupby('accession')
    .agg({'series': lambda s: set(s), 'tissue_id': lambda s: set(s)})
)

In [20]:
_t.head()

Unnamed: 0_level_0,series,tissue_id
accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM10000,set([GSE665]),set([BTO:0000759])
GSM1000000,"set([GSE40732, GSE40736])",set([BTO:0001025])
GSM1000001,"set([GSE40732, GSE40736])",set([BTO:0001025])
GSM1000002,"set([GSE40732, GSE40736])",set([BTO:0001025])
GSM1000003,"set([GSE40732, GSE40736])",set([BTO:0001025])


In [21]:
samples_tissue = (
    _t
    .assign(_len=_t.tissue_id.map(len))
    .query('_len == 1')
    .assign(tissue_id=lambda d: d['tissue_id'].map(lambda x: list(x)[0]))
    [['tissue_id']]
    .reset_index()
    .merge(samples_series[['accession', 'platform']].drop_duplicates())
)
print(samples_tissue.shape)
samples_tissue.head()

(550745, 3)


Unnamed: 0,accession,tissue_id,platform
0,GSM10000,BTO:0000759,GPL85
1,GSM1000000,BTO:0001025,GPL16025
2,GSM1000001,BTO:0001025,GPL16025
3,GSM1000002,BTO:0001025,GPL16025
4,GSM1000003,BTO:0001025,GPL16025


## Препроцессинг

In [22]:
series_preprocessed = pd.read_pickle('../data/preproc/series.converted.res1.pickle').assign(preprocessed=lambda d: 1).drop('platform', axis=1)
print(series_preprocessed.shape)
series_preprocessed.head()

(4801, 2)


Unnamed: 0_level_0,series,preprocessed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
13843,GSE65835,1
13844,GSE53224,1
13845,GSE53183,1
13846,GSE53157,1
13847,GSE65721,1


In [23]:
samples_preprocessed = series_preprocessed.merge(samples_series[['series', 'accession', 'platform']])
print(samples_preprocessed.shape)
samples_preprocessed.head()

(178707, 4)


Unnamed: 0,series,preprocessed,accession,platform
0,GSE65835,1,GSM1607354,GPL570
1,GSE65835,1,GSM1607355,GPL570
2,GSE65835,1,GSM1607356,GPL570
3,GSE65835,1,GSM1607357,GPL570
4,GSE65835,1,GSM1607358,GPL570


## Сводная таблица

In [24]:
samples_annot = (
    samples_norms
    .merge(samples_age, how='left')
    .merge(samples_tissue, how='left')
    .merge(samples_norms, how='left')
    .merge(samples_preprocessed, how='left')
    .drop_duplicates()
)
print(samples_annot.shape)
samples_annot.head()

(1730022, 7)


Unnamed: 0,accession,norm,series,platform,age,tissue_id,preprocessed
0,GSM305072,1,GSE12103,GPL6848,,BTO:0000089,
1,GSM305305,1,GSE12103,GPL6848,,BTO:0000089,
2,GSM305310,1,GSE12103,GPL6848,,BTO:0000089,
3,GSM1422841,1,GSE58949,GPL18883,,,
4,GSM1422846,1,GSE58949,GPL18883,,,


In [69]:
samples_series.shape, samples_series.accession.unique().shape

((1730022, 3), (1430868,))

## Запись результатов

In [26]:
samples_annot.to_pickle('../data/geo-annotation/samples.annot.res6.pickle')

In [4]:
samples_annot = pd.read_pickle('../data/geo-annotation/samples.annot.res6.pickle')