<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Merge-Age-/-Sex-tables-together" data-toc-modified-id="Merge-Age-/-Sex-tables-together-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Merge Age / Sex tables together</a></span></li></ul></div>

This notebook does a double check to make sure that all of the metadata has been standardized.

In [1]:
!ls 

Benchmarking		  Kang2017
Benchmark-metadata.ipynb  Kang-validation-setup.ipynb
Berding2020		  Liu2017
Cao2021			  Liu2019
Chen2020		  Metadata_cleaning.ipynb
Combined		  Metadata-standardization.ipynb
Dan2020			  Pulikkan2018
David2021		  Son2015
Fasano2020		  Zou2020
Fouquier2021		  Zurita2019


In [2]:
import glob
import pandas as pd
from gneiss.util import match
import numpy as np
from biom.util import biom_open

# drop these studies, since they are lacking key data
# Note that we are going to save Kang2017 for another analysis
droplist = {'Liu2019', 'Liu2017', 'Pulikkan2018', 'Kang2017'} 
folders = glob.glob('*')
folders = list(filter(lambda x: 'ipynb' not in x, folders))
folders = list(filter(lambda x: 'Combined' not in x, folders))
folders = list(filter(lambda x: 'Benchmarking' not in x, folders))
folders = list(filter(lambda x: x not in droplist, folders))

mds = list(map(lambda x: pd.read_table(f'{x}/sample_metadata.txt'), folders))
for i in range(len(mds)):
    mds[i] = mds[i].rename(columns={mds[i].columns[0]: 'sampleid'})
    mds[i] = mds[i].set_index('sampleid')
md_dict = dict(list(zip(folders, mds)))

combined = pd.concat(mds, axis=0)
cs = combined[['Age', 'Sex', 'Status', 'Control_Type', 'Cohort', 'Subjects_Location', 'Variable_Region', 'Match_IDs']]

In [3]:
for i, md in enumerate(mds):
    print(md.Cohort[0], md.Variable_Region[0], md.Control_Type[0], sorted(list(md.Status.value_counts().index)))

Fasano2020 V4 Age_Sex_Match ['ASD', 'Control']
Chen2020 V3V4 Age_Sex_Match ['ASD', 'Control']
Fouquier2021 V4 Age_Sex_Match ['ASD', 'Control']
Son2015 V1V2 Sibling_Match ['ASD', 'Control']
David2021 V4 Sibling_Match ['ASD', 'Control']
Zurita2019 V4 Age_Sex_Match ['ASD', 'Control']
Zou2020 V3V4 Age_Sex_Match ['ASD', 'Control']
Dan2020 V4 Age_Sex_Match ['ASD', 'Control']
Berding2020 V3V4 Age_Sex_Match ['ASD', 'Control']
Cao2021 V4 Age_Sex_Match ['ASD', 'Control']


# Merge Age / Sex tables together

In [4]:
from biom import load_table
bioms = list(map(lambda x: load_table(f'{x}/deblur/all.biom'), folders))
bioms = dict(list(zip(folders, bioms)))

In [5]:
def filter_and_save(t, md, i):
    md = md.copy()
    read_f = lambda v, i, m: np.sum(v) > 1000
    t1 = t.filter(read_f, axis='sample', inplace=False)

    # drop singleton matches
    t1, md = match(t1, md)
    vc = md['Match_IDs'].value_counts()
    drop_list = set(vc.loc[vc==2].index)
    idx = md.Match_IDs.apply(lambda x: x in drop_list)
    md = md.loc[idx]
    t1, md = match(t1, md)
    
    filter_f = lambda v, i, m: np.sum(v>0) > 10
    t1 = t1.filter(filter_f, axis='observation', inplace=False)
    with biom_open(f'Benchmarking/matched_{i}.biom', 'w') as f:
        t1.to_hdf5(f, 'age_sex_matched')
        md.to_csv(f'Benchmarking/match_metadata_{i}.txt', sep='\t')

In [6]:
t = bioms['Chen2020']
filter_and_save(t, cs, 0)
total = 0
for i, f in enumerate(list(set(folders) - set(['Son2015', 'David2021', 'Cao2021']))):
    print(f, len(set(bioms[f].ids()) & set(md_dict[f].index)))
    total += len(set(bioms[f].ids()) & set(md_dict[f].index))
    if f != 'Chen2020':
        t = t.merge(bioms[f])
        filter_and_save(t, cs, i)

Fasano2020 78
Chen2020 94
Berding2020 44
Zurita2019 44
Zou2020 78
Fouquier2021 68
Dan2020 279


In [7]:
md

Unnamed: 0_level_0,Sex,Age,Status,Cohort,Subjects_Location,Control_Type,Variable_Region,Description,Match_IDs
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SRR12113737,male,5,Control,Cao2021,China,Age_Sex_Match,V4,SRR12113737,Cao2021_13
SRR12113744,male,5,Control,Cao2021,China,Age_Sex_Match,V4,SRR12113744,Cao2021_3
SRR12113721,female,7,Control,Cao2021,China,Age_Sex_Match,V4,SRR12113721,Cao2021_21
SRR12113734,male,5,Control,Cao2021,China,Age_Sex_Match,V4,SRR12113734,Cao2021_9
SRR12113785,male,8,ASD,Cao2021,China,Age_Sex_Match,V4,SRR12113785,Cao2021_29
...,...,...,...,...,...,...,...,...,...
SRR12113766,male,9,ASD,Cao2021,China,Age_Sex_Match,V4,SRR12113766,Cao2021_28
SRR12113770,male,4,ASD,Cao2021,China,Age_Sex_Match,V4,SRR12113770,Cao2021_27
SRR12113762,male,2,ASD,Cao2021,China,Age_Sex_Match,V4,SRR12113762,Cao2021_25
SRR12113789,male,8,ASD,Cao2021,China,Age_Sex_Match,V4,SRR12113789,Cao2021_18
