<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np
from biom import load_table
from q2_matchmaker._matching import _matchmaker

In [2]:
# Cao2021: read the SRA run table (indexed by its first column), keep the
# three host columns, add the cohort-level constants, and rename the host
# columns to the shared Sex / Age / Status names.
md4 = (
    pd.read_table('./Cao2021/SraRunTable.txt', sep=',', index_col=0)
    .loc[:, ['host_sex', 'Host_Age', 'host_genotype']]
    .assign(
        Cohort='Cao2021',
        Subjects_Location='China',
        Control_Type='Age_Sex_Match',
        Variable_Region='V4',
    )
    .rename(columns={
        'host_sex': 'Sex',
        'Host_Age': 'Age',
        'host_genotype': 'Status',
    })
)

def status_f(x):
    """Translate a Cao2021 ``host_genotype`` value into a case/control label.

    Returns 'Control' when ``x`` equals 'NT'; every other value is
    labelled 'ASD'.
    """
    return 'Control' if x == 'NT' else 'ASD'
    
# Relabel the genotype codes as case/control and keep the run table's index
# value as each sample's description.
md4 = md4.assign(
    Status=md4['Status'].apply(status_f),
    Description=md4.index,
)

# Pair samples on Age and Sex.
# NOTE(review): `types` is passed through to q2_matchmaker as-is; presumably
# each flag marks whether the corresponding match column is categorical —
# confirm against the library.
md4['Match_IDs'] = _matchmaker(
    md4,
    status='Status',
    match_columns=['Age', 'Sex'],
    types=[False, True],
)

# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column, not only Match_IDs — confirm that this is intended.
md4 = md4.dropna()
md4['Match_IDs'] = md4['Match_IDs'].apply(
    lambda match_id: f'Cao2021_{int(match_id)}'
)

# Label the index column and write the tab-separated metadata file.
md4.index.name = 'sample_name'
md4.to_csv('./Cao2021/sample_metadata.txt', sep='\t')

In [3]:
# Sync Fasano2020
# This study has 42 ASD children (host IDs starting with "A"),
# 42 moms of ASD children (host IDs starting with "S"),
# and 44 age-sex-matched controls (host IDs starting with "T").
md5 = (
    pd.read_table('./Fasano2020/SraRunTable.txt', sep=',', index_col=0)
    .loc[:, ['host_subject_id', 'host_age']]
    .assign(
        Cohort='Fasano2020',
        Subjects_Location='China',
        Control_Type='Age_Sex_Match',  # 'Age_Sex_Match_wMoms'
        Variable_Region='V45',
        Sex='Unknown',
    )
    # The host ID is stored under `sample_name`; the helpers below parse it.
    .rename(columns={
        'host_age': 'Age',
        'host_subject_id': 'sample_name',
    })
)

# The first character of a Fasano2020 host ID encodes the subject group.
lookup = {'T': 'Control', 'A': 'ASD', 'S': 'ASD_Mom'}


def status_f(x):
    """Return the group label for host ID ``x`` from its first character.

    Raises KeyError when the leading character is not a key of ``lookup``.
    """
    prefix = x[0]
    return lookup[prefix]


def household_f(x):
    """Return the household part of host ID ``x``.

    IDs starting with 'T' get the string 'NA'; for all other IDs the
    household is everything after the first character.
    """
    return 'NA' if x[0] == 'T' else x[1:]
    
# Derive the group label and household from the host ID (held in the
# `sample_name` column at this point) and keep the run table's index value
# as each sample's description.
md5['Status'] = md5['sample_name'].apply(status_f)
md5['Household'] = md5['sample_name'].apply(household_f)
md5['Description'] = md5.index

# drop moms; .copy() so the column assignments below act on an independent
# frame instead of a slice of the unfiltered table
md5 = md5.loc[md5['Status'] != 'ASD_Mom'].copy()
# perform age matching
md5['Match_IDs'] = _matchmaker(md5, status='Status',
                               match_columns=['Age'],
                               types=[False])
md5 = md5.dropna(subset=['Match_IDs'])

md5['Match_IDs'] = md5['Match_IDs'].apply(lambda x: f'Fasano2020_{int(x)}')

# The index is about to be labelled `sample_name`. Move the host-ID column out
# of the way first (the Chen2020 cell does the same), otherwise the written TSV
# carries two columns with the identical header `sample_name`.
md5 = md5.rename(columns={'sample_name': 'original_name'})
md5.index.name = 'sample_name'
#md5 = md5.loc[~md5.index.duplicated(keep='first')]
md5.to_csv('./Fasano2020/sample_metadata.txt', sep='\t')

In [4]:
# Sync Chen2020 metadata
#
# The SRA metadata does not clearly say which group each sample belongs to.
# Working assumption: samples with the "JSH0A" structure are autism samples
# (these have the correct number of samples according to the study),
# "JSH0M" are mom samples, "JSH0C" are TD children and "JSH0D" are TD moms.
md6_ex = pd.read_table('./Chen2020/sample_metadata_RM.txt', sep='\t', index_col=0)
md6 = (
    pd.read_table('./Chen2020/SraRunTable.txt', sep=',', index_col=0)
    # Attach the extra metadata by matching 'Sample Name' to md6_ex's index.
    .merge(md6_ex, left_on='Sample Name', right_index=True)
    .loc[:, ['Sample Name', 'AGE', 'sex', 'Household', 'Diagnosis']]
    .assign(
        Cohort='Chen2020',
        Subjects_Location='China',
        Control_Type='Age_Sex_Match',  # 'Age_Sex_Match_wMoms'
        Variable_Region='V34',
    )
    .rename(columns={
        'AGE': 'Age',
        'Sample Name': 'sample_name',
        'sex': 'Sex',
    })
)

def status_f(x):
    """Translate a Chen2020 ``Diagnosis`` code into a status label.

    'TD' -> 'Control', 'ASD' -> 'ASD', 'ASDM' -> 'ASD_Mom'; any other
    value is labelled 'Control_Mom'.
    """
    if x == 'ASD':
        return 'ASD'
    if x == 'ASDM':
        return 'ASD_Mom'
    if x == 'TD':
        return 'Control'
    return 'Control_Mom'

md6['Status'] = md6['Diagnosis'].apply(status_f)

# Age is read as text; drop the placeholder rows before converting to float.
# .copy() so the assignments below act on an independent frame.
md6 = md6.loc[md6['Age'] != 'not collected'].copy()
md6['Age'] = md6['Age'].astype(np.float64)
# drop moms (both 'ASD_Mom' and 'Control_Mom' contain 'Mom')
md6 = md6.loc[~md6['Status'].apply(lambda x: 'Mom' in x)].copy()
# perform age and sex matching
md6['Match_IDs'] = _matchmaker(md6, status='Status',
                               match_columns=['Age', 'Sex'],
                               types=[False, True])
md6 = md6.dropna(subset=['Match_IDs'])

# Cast to int before formatting so the label is 'Chen2020_3' rather than
# 'Chen2020_3.0', matching the labels built for every other cohort.
md6['Match_IDs'] = md6['Match_IDs'].apply(lambda x: f'Chen2020_{int(x)}')

md6['Description'] = md6.index
# Free up the `sample_name` header for the index label.
md6 = md6.rename(columns={'sample_name': 'original_name'})
md6.index.name = 'sample_name'
md6.to_csv('./Chen2020/sample_metadata.txt', sep='\t')

In [5]:
# Sync Son2015 metadata
#
# The SRA metadata does not clearly say which group each sample belongs to.
# The sample alias has the structure F_001C1_A. Working assumptions:
#   * F_00X is the "Family"             -> number stored in the Household column
#   * _A / _U is "Autism" vs control    -> ASD / TD stored in the Diagnosis column
#   * C1 / C2 is collection number 1/2  -> stored in a "Collection_Number" column
md7_ex = pd.read_table('./Son2015/sample_metadata_RM.txt', sep='\t', index_col=0)
md7 = (
    pd.read_table('./Son2015/SraRunTable.txt', sep=',', index_col=0)
    .merge(md7_ex, left_index=True, right_on="file_name")
    # Index the merged table by file name while keeping the column itself.
    .set_index('file_name', drop=False)
    .loc[:, ['Household', 'Diagnosis', 'original_name', 'host_sex']]
    .rename(columns={'host_sex': 'Sex'})
    .assign(
        Cohort='Son2015',
        Subjects_Location='USA',
        Control_Type='Sibling_Match',
        Variable_Region='V12',
    )
)

# Read in Age data
md = pd.read_excel('./Son2015/Demographic Table2.xlsx', engine='openpyxl')
age = pd.to_datetime(md['Date_completion']) - pd.to_datetime(md['Child_DOB'])
# Convert the elapsed time to years. np.timedelta64(1, 'Y') is an ambiguous
# calendar unit that newer pandas versions refuse to use in timedelta
# arithmetic, so divide by the fixed-length equivalent instead: NumPy defines
# one 'Y' as 365.2425 days, which keeps the values unchanged.
md['Age'] = age / pd.Timedelta(days=365.2425)

# Bring `file_name` back as a column, then build a join key from the second
# underscore-separated token of `original_name`, with dashes removed on both
# sides of the join.
md7 = md7.reset_index()
md7['ID'] = md7['original_name'].apply(lambda x: x.split('_')[1].replace('-', ''))
md['FULL ID'] = md['FULL ID'].apply(lambda x: x.replace('-', ''))
md7 = pd.merge(md7, md, left_on='ID', right_on='FULL ID')

lookup = {'TD': 'Control', 'ASD': 'ASD'}
# Households serve as the match groups for this cohort.
md7['Match_IDs'] = md7['Household']
md7['Status'] = md7['Diagnosis'].apply(lambda x: lookup[x])
# After the merge the index is a plain row counter, so take the description
# from the sample's file name rather than from the index.
md7['Description'] = md7['file_name']
md7 = md7.set_index('file_name')
md7.to_csv('./Son2015/sample_metadata.txt', sep='\t')

In [6]:
# Kang2017 et al
#
# FMT study - the metadata is a bit more complex than for the other cohorts.
md9 = (
    pd.read_table('./Kang2017/SraRunTable.txt', sep=',', index_col=0)
    .loc[:, ['GROUP', 'collection-method', 'host_subject_id',
             'weeks-since-experiment-start',
             'host_sex', 'host_age_at_week_0', 'cars']]
    .assign(
        Cohort='Kang2017',
        Subjects_Location='USA',
        Control_Type='Age_Sex_Match',
        Variable_Region='V4',
    )
    .rename(columns={'host_sex': 'Sex', 'host_age_at_week_0': 'Age'})
)

# Lists the distinct GROUP values; the result is not stored.
md9['GROUP'].drop_duplicates()

def status_f(x):
    """Translate a Kang2017 ``GROUP`` value into a status label.

    'autism' -> 'ASD', 'neurotypical' -> 'Control',
    'neurotypical_mom' -> 'Control_Mom'. Any other value yields None.
    """
    known_groups = (
        ('autism', 'ASD'),
        ('neurotypical', 'Control'),
        ('neurotypical_mom', 'Control_Mom'),
    )
    for group, label in known_groups:
        if x == group:
            return label
    return None
# Add the status label, keep only the first row for each repeated index
# value, and write the tab-separated metadata file.
md9 = (
    md9
    .assign(Status=md9['GROUP'].apply(status_f))
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
md9.to_csv('./Kang2017/sample_metadata.txt', sep='\t')

In [7]:
# Fouquier 2021
# Load the run table indexed by run accession, rename the age/sex columns to
# the shared names, and discard rows whose age is the text 'missing' so the
# column can be converted to float.
md10 = (
    pd.read_csv('Fouquier2021/SraRunTable.txt')
    .set_index('Run')
    .rename(columns={'age_in_years': 'Age', 'gender': 'Sex'})
    .loc[lambda frame: frame['Age'] != 'missing']
    .assign(
        Control_Type='Age_Sex_Match',
        Age=lambda frame: frame['Age'].astype(np.float64),
    )
)

# The `asd` column holds Yes/No; any other value raises KeyError.
lookup = {'No': 'Control', 'Yes': 'ASD'}
md10 = md10.assign(
    Status=md10['asd'].apply(lambda answer: lookup[answer]),
    Cohort='Fouquier2021',
    Variable_Region='V4',
)

md10['Match_IDs'] = _matchmaker(
    md10,
    status='Status',
    match_columns=['Age', 'Sex'],
    types=[False, True],
)
# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column of the run table, not only Match_IDs — confirm that this is intended.
md10 = md10.dropna()
md10['Match_IDs'] = md10['Match_IDs'].apply(
    lambda match_id: f'Fouquier2021_{int(match_id)}'
)
md10.to_csv('Fouquier2021/sample_metadata.txt', sep='\t')

In [8]:
# David 2021
# Every column is read as text (dtype=str).
md11 = pd.read_table('./David2021/sample_metadata_JM.txt', dtype=str)

# Treatment codes used by this cohort; any other value raises KeyError.
lookup = {'Aut': 'ASD', 'Control': 'Control'}
md11 = md11.assign(
    Matching_IDs=md11['Pair'],
    Control_Type='Sibling_Match',
    Status=md11['Treatment'].apply(lambda treatment: lookup[treatment]),
)
def age_f(x):
    """Convert an age in months to years.

    Null values (as judged by ``pd.isnull``) are returned unchanged; anything
    else is converted with ``float`` and divided by 12.
    """
    return x if pd.isnull(x) else float(x) / 12


def sex_f(x):
    """Expand the sex codes 'm' / 'f' to 'male' / 'female'.

    Any other value is returned unchanged.
    """
    return 'male' if x == 'm' else 'female' if x == 'f' else x

# Add Age (years) and Sex, index by sample ID, add the cohort constants and
# the pair-based match label, then write the tab-separated metadata file.
md11 = (
    md11
    .assign(
        Age=md11['age_month_ok'].apply(age_f),
        Sex=md11['Gender'].apply(sex_f),
    )
    .set_index('SampleID')
    .assign(
        Cohort='David2021',
        Variable_Region='V4',
        Match_IDs=lambda frame: frame['Pair'].apply(
            lambda pair: f'David2021_{int(pair)}'
        ),
    )
)
md11.to_csv('David2021/sample_metadata.txt', sep='\t')

In [9]:
# Berding 2020
# Every column is read as text; Age is converted to float for matching and
# rows without Age or Sex are dropped.
md12 = pd.read_table('./Berding2020/sample_metadata_JM.txt', dtype=str)
md12['Age'] = md12['Age'].astype(np.float64)
md12 = md12.dropna(subset=['Age', 'Sex'])
#del md12['matching_id']
# Drops the LAST column by position.
# NOTE(review): per the commented-out line above this is presumably
# `matching_id` — confirm against the input file's column order.
md12 = md12.iloc[:, :-1]

lookup = {'CONT': 'Control', 'ASD': 'ASD'}
md12['Status'] = md12['Status'].apply(lambda x: lookup[x])
md12['Match_IDs'] = _matchmaker(md12, status='Status',
                                match_columns=['Age', 'Sex'],
                                types=[False, True])
md12 = md12.dropna(subset=['Match_IDs'])
md12['Match_IDs'] = md12['Match_IDs'].apply(lambda x: f'Berding2020_{int(x)}')
md12['Control_Type'] = 'Age_Sex_Match'
md12['Variable_Region'] = 'V34'
md12 = md12.set_index('#SampleID')
md12 = md12.loc[~md12.index.duplicated(keep='first')]

# Keep a match group only when its first two rows have different statuses.
# The de-duplication above can leave a group with a single row; such a group
# is dropped here instead of raising IndexError on `iloc[1]`.
md12 = md12.groupby('Match_IDs').filter(
    lambda grp: len(grp) >= 2 and grp.iloc[0]['Status'] != grp.iloc[1]['Status']
)

md12.to_csv('Berding2020/sample_metadata.txt', sep='\t')

In [10]:
# Zou 2020
# Join the SRA run table to the study spreadsheet: the run table's
# 'Library Name' is matched against the spreadsheet's 'Seq.No.'.
md13 = pd.read_excel('Zou2020/Zou_et_al_2020.xlsx')
sra = pd.read_csv('Zou2020/SraRunTable.txt')
md13 = (
    sra
    .merge(md13, left_on='Library Name', right_on='Seq.No.')
    # The age header uses full-width (non-ASCII) parentheses; it must be
    # matched exactly.
    .rename(columns={
        'Gender': 'Sex',
        'Age（year）': 'Age',
        'Run': 'sampleid',
    })
)

def status_f(x):
    """Label a Zou2020 'Seq.No.' value as 'ASD' or 'Control'.

    Values containing the substring 'case' are 'ASD'; all others are
    'Control'.
    """
    return 'ASD' if 'case' in x else 'Control'

# Label each sample from its 'Seq.No.' and pair samples on Age and Sex.
md13 = md13.assign(Status=md13['Seq.No.'].apply(status_f))
md13['Match_IDs'] = _matchmaker(
    md13,
    status='Status',
    match_columns=['Age', 'Sex'],
    types=[False, True],
)

# Discard samples without a match ID, then prefix the ID with the cohort name.
md13 = md13.dropna(subset=['Match_IDs'])
md13['Match_IDs'] = md13['Match_IDs'].apply(
    lambda match_id: f'Zou2020_{int(match_id)}'
)

# Index by run accession, add the cohort constants, and keep only the first
# row for each repeated accession before writing.
md13 = (
    md13
    .set_index('sampleid')
    .assign(
        Variable_Region='V34',
        Cohort='Zou2020',
        Control_Type='Age_Sex_Match',
    )
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
md13.to_csv('Zou2020/sample_metadata.txt', sep='\t')

In [11]:
# Dan 2020
# The input file already carries match IDs; only the column name is
# normalised and the cohort-level constants are added.
md14 = pd.read_table('./Dan2020/sample_metadata_JM.txt', dtype=str)
md14 = md14.rename(columns={'match_ids': 'Match_IDs'})
md14['Variable_Region'] = 'V4'
md14['Cohort'] = 'Dan2020'
md14['Control_Type'] = 'Age_Sex_Match'
md14 = md14.set_index('sampleid')
# De-duplicate AFTER indexing by sample ID. Before set_index the frame has the
# default row-number index, on which `duplicated` can never be True, so the
# check would not remove anything.
md14 = md14.loc[~md14.index.duplicated(keep='first')]
md14.to_csv('Dan2020/sample_metadata.txt', sep='\t')

In [12]:
# Zurita
# Every column is read as text; Age is converted to float before matching.
md15 = pd.read_table('./Zurita2019/sample_metadata_JM.txt', dtype=str)
md15 = md15.rename(columns={'match_ids': 'Match_IDs'})
md15['Control_Type'] = 'Age_Sex_Match'
md15['Variable_Region'] = 'V4'
md15 = md15.set_index('#SampleID')
# De-duplicate AFTER indexing by sample ID. Before set_index the frame has the
# default row-number index, on which `duplicated` can never be True, so the
# check would not remove anything. .copy() so the assignments below act on an
# independent frame.
md15 = md15.loc[~md15.index.duplicated(keep='first')].copy()
md15['Age'] = md15['Age'].astype(np.float64)

# Recompute the match IDs on Age and Sex (overwrites any IDs from the file),
# drop unmatched samples, and prefix the ID with the cohort name.
md15['Match_IDs'] = _matchmaker(md15, status='Status',
                                match_columns=['Age', 'Sex'],
                                types=[False, True])
md15 = md15.dropna(subset=['Match_IDs'])
md15['Match_IDs'] = md15['Match_IDs'].apply(lambda x: f'Zurita2019_{int(x)}')
md15.to_csv('Zurita2019/sample_metadata.txt', sep='\t')