<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Merge-Age-/-Sex-tables-together" data-toc-modified-id="Merge-Age-/-Sex-tables-together-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Merge Age / Sex tables together</a></span></li><li><span><a href="#Merge-Sibling-matched-tables" data-toc-modified-id="Merge-Sibling-matched-tables-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Merge Sibling matched tables</a></span></li></ul></div>

This notebook does a double check to make sure that all of the metadata has been standardized.

In [1]:
import glob
import pandas as pd
from gneiss.util import match
import numpy as np

# drop these studies, since they are lacking key data
# Note that we are going to save Kang2017 for another analysis
droplist = {'Liu2019', 'Liu2017', 'Pulikkan2018', 'Kang2017', 'Cao2021'} 
folders = glob.glob('*')
folders = list(filter(lambda x: 'ipynb' not in x, folders))
folders = list(filter(lambda x: 'Combined' not in x, folders))
folders = list(filter(lambda x: 'Benchmarking' not in x, folders))
folders = list(filter(lambda x: x not in droplist, folders))

mds = list(map(lambda x: pd.read_table(f'{x}/sample_metadata.txt'), folders))
for i in range(len(mds)):
    mds[i] = mds[i].rename(columns={mds[i].columns[0]: 'sampleid'})
    mds[i] = mds[i].set_index('sampleid')
md_dict = dict(list(zip(folders, mds)))

combined = pd.concat(mds, axis=0)
cs = combined[['Age', 'Sex', 'Status', 'Control_Type', 'Cohort', 'Subjects_Location', 'Variable_Region', 'Match_IDs']]

In [2]:
for i, md in enumerate(mds):
    print(md.Cohort[0], md.Variable_Region[0], md.Control_Type[0], sorted(list(md.Status.value_counts().index)))

Fasano2020 V45 Age_Sex_Match ['ASD', 'Control']
Chen2020 V34 Age_Sex_Match ['ASD', 'Control']
Fouquier2021 V4 Age_Sex_Match ['ASD', 'Control']
Son2015 V12 Sibling_Match ['ASD', 'Control']
David2021 V4 Sibling_Match ['ASD', 'Control']
Zurita2019 V4 Age_Sex_Match ['ASD', 'Control']
Zou2020 V34 Age_Sex_Match ['ASD', 'Control']
Dan2020 V4 Age_Sex_Match ['ASD', 'Control']
Berding2020 V34 Age_Sex_Match ['ASD', 'Control']


# Merge Age / Sex tables together

In [3]:
from biom import load_table
bioms = list(map(lambda x: load_table(f'{x}/deblur/all.biom'), folders))
bioms = dict(list(zip(folders, bioms)))

In [4]:
t = bioms['Chen2020']
total = 0
for f in list(set(folders) - set(['Son2015', 'David2021', 'Cao2021'])):
    print(f, len(set(bioms[f].ids()) & set(md_dict[f].index)))
    total += len(set(bioms[f].ids()) & set(md_dict[f].index))
    if f != 'Chen2020':
        t = t.merge(bioms[f])

Fouquier2021 68
Berding2020 52
Zurita2019 44
Dan2020 279
Zou2020 78
Chen2020 94
Fasano2020 78


In [5]:
filter_f = lambda v, i, m: np.sum(v>0) > 10
age_sex_biom = t.filter(filter_f, axis='observation')
read_f = lambda v, i, m: np.sum(v) > 100
age_sex_biom = age_sex_biom.filter(read_f, axis='sample')

In [6]:
# drop singleton matches
age_sex_biom, age_sex_md = match(age_sex_biom, cs)
vc = age_sex_md['Match_IDs'].value_counts()
drop_list = set(vc.loc[vc==2].index)
idx = age_sex_md.Match_IDs.apply(lambda x: x in drop_list)
age_sex_md = age_sex_md.loc[idx]
age_sex_biom, age_sex_md = match(age_sex_biom, age_sex_md)

In [7]:
from biom.util import biom_open
with biom_open('Combined/age_sex_matched.biom', 'w') as f:
    age_sex_biom.to_hdf5(f, 'age_sex_matched')
age_sex_md.to_csv('Combined/age_sex_match_metadata.txt', sep='\t')

In [8]:
age_sex_biom, age_sex_md.shape

(2655 x 692 <class 'biom.table.Table'> with 153983 nonzero entries (8% dense),
 (692, 8))

# Merge Sibling matched tables

In [9]:
t = bioms['Son2015']
total = 0
for f in ['Son2015', 'David2021']:
    print(f, len(set(bioms[f].ids()) & set(md_dict[f].index)))
    total += len(set(bioms[f].ids()) & set(md_dict[f].index))
    if f != 'Son2015':
        t = t.merge(bioms[f])

Son2015 102
David2021 136


In [10]:
filter_f = lambda v, i, m: np.sum(v>0) > 10
sibling_biom = t.filter(filter_f, axis='observation')
filter_f = lambda v, i, m: np.sum(v) > 100
sibling_biom = sibling_biom.filter(filter_f, axis='sample')

# drop singleton matches
sibling_biom, sibling_md = match(sibling_biom, cs)
vc = sibling_md['Match_IDs'].value_counts()
drop_list = set(vc.loc[vc==2].index)
idx = sibling_md.Match_IDs.apply(lambda x: x in drop_list)
sibling_md = sibling_md.loc[idx]
sibling_biom, sibling_md = match(sibling_biom, cs)

In [11]:
from biom.util import biom_open
with biom_open('Combined/sibling_matched.biom', 'w') as f:
    sibling_biom.to_hdf5(f, 'sibling_matched')
sibling_md.to_csv('Combined/sibling_match_metadata.txt', sep='\t')

In [12]:
sibling_biom, sibling_md.shape

(1388 x 237 <class 'biom.table.Table'> with 46079 nonzero entries (14% dense),
 (237, 8))