<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Noto-2014" data-toc-modified-id="Noto-2014-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Noto 2014</a></span></li><li><span><a href="#Needham-2020" data-toc-modified-id="Needham-2020-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Needham 2020</a></span></li><li><span><a href="#Kuwabara-2013" data-toc-modified-id="Kuwabara-2013-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Kuwabara 2013</a></span></li><li><span><a href="#Noto-2014" data-toc-modified-id="Noto-2014-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Noto 2014</a></span></li><li><span><a href="#West-2014" data-toc-modified-id="West-2014-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>West 2014</a></span></li><li><span><a href="#Kang-2018" data-toc-modified-id="Kang-2018-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Kang 2018</a></span></li></ul></div>

In [75]:
import os
import qiime2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from q2_matchmaker._matching import _matchmaker
from biom.util import biom_open
import biom
%matplotlib inline
# Root folder of the downloaded metabolomics studies; cells below join it with a
# per-study subfolder to locate input spreadsheets (and a few CSV exports).
data_dir = '~/ceph/sfari/data/metabolomics'

def save_qiime2_metadata(filename, df, types : dict = None):
    """Write a DataFrame to disk as a QIIME 2 metadata file.

    Parameters
    ----------
    filename : str
        Destination path, handed to ``qiime2.Metadata.save``.
    df : pd.DataFrame
        Table to export. The function works on a copy of it.
    types : dict, optional
        Column -> dtype mapping applied with ``DataFrame.astype`` before the
        frame is wrapped as metadata. Skipped when ``None``.
    """
    out = df.copy()
    # the index is labelled 'sampleid' before the frame is wrapped as Metadata
    out.index.name = 'sampleid'
    out = out if types is None else out.astype(types)
    qiime2.Metadata(out).save(filename)

# Noto 2014

In [105]:
res_dir = '../sfari/data/metabolomics/Noto2014'
# intensity spreadsheet, indexed by its 'NAME' column
noto_urine = (
    pd.read_excel(os.path.join(res_dir, 'Controlli vs ASD_Mussap_2020.xlsx'))
    .set_index('NAME')
)
# sample metadata; rows lacking AGE or GENDER are discarded
noto_md = (
    pd.read_excel(os.path.join(res_dir, 'METADATA_mussap_2020.xlsx'))
    .dropna(subset=['AGE', 'GENDER'])
)
# skip the first remaining column and treat missing intensities as 0
noto_urine_ms = noto_urine.iloc[:, 1:].fillna(0)
noto_urine_md = noto_md.set_index('collection code')

def assign_f(x):
    """Return 'ASD' when the label contains the substring 'ASD', else 'Control'."""
    return 'ASD' if 'ASD' in x else 'Control'
# keep only the samples present in both the intensity table and the metadata
noto_urine_ms, noto_urine_md = noto_urine_ms.align(noto_urine_md, join='inner', axis=0)
noto_urine_md['Status'] = noto_urine_md['GROUP'].apply(assign_f)

# pair samples across Status using AGE and GENDER
# (types flags presumably mean False = continuous, True = categorical -- confirm in q2_matchmaker)
noto_urine_md['Match_IDs'] = _matchmaker(
    noto_urine_md,
    'Status',
    ['AGE', 'GENDER'],
    types=[False, True],
)
# samples left without a match id are discarded; the ids are then stored as integers
noto_urine_md = (
    noto_urine_md
    .dropna(subset=['Match_IDs'])
    .astype({'Match_IDs': np.int64})
)
# re-align so the intensity table drops the unmatched samples too
noto_urine_ms, noto_urine_md = noto_urine_ms.align(noto_urine_md, join='inner', axis=0)

def rename_f(x):
    """Return ``x`` with every '#' replaced by 'NOTO'."""
    renamed = x.replace('#', 'NOTO')
    return renamed
# swap the '#' in every sample id for 'NOTO', in both tables
noto_urine_md.index = [rename_f(sample_id) for sample_id in noto_urine_md.index]
noto_urine_ms.index = [rename_f(sample_id) for sample_id in noto_urine_ms.index]

# QIIME 2 sample metadata: only Status and Match_IDs, with Match_IDs cast to str
save_qiime2_metadata(
    f'{res_dir}/urine_sample_metadata.txt',
    noto_urine_md[['Status', 'Match_IDs']],
    types={'Match_IDs': 'str'},
)
with biom_open(f'{res_dir}/urine.biom', 'w') as f:
    # values are transposed: the table's columns become observation ids, its index the sample ids
    table = biom.Table(noto_urine_ms.values.T, noto_urine_ms.columns, noto_urine_ms.index)
    table.to_hdf5(f, 'urine')

  warn(msg)


# Needham 2020

Relevant paper:
https://www.biologicalpsychiatryjournal.com/article/S0006-3223(20)31985-5/fulltext

In [98]:
res_dir = '../sfari/data/metabolomics/Needham2020'
needham = os.path.join(data_dir, 'Needham2020', 'mmc2.xlsx')
# parse out plasma metabolites from the original-scale sheet
# (scaled/imputed alternative: sheet 'Table S3. ScaledImpData HumPlas')
needham_plasma = pd.read_excel(needham, sheet_name='Table S2. OrigScale HumPlas')

# sample metadata: first 25 sheet rows from column 11 on, transposed so each row is a
# sheet column; field names come from sheet column 10 and the last transposed row is dropped
needham_plasma_md = needham_plasma.iloc[:25, 11:].T
needham_plasma_md.columns = needham_plasma.iloc[:25, 10].values
# the spreadsheet pads the 'Group HMDB' label with leading spaces; normalise it
needham_plasma_md = needham_plasma_md.iloc[:-1].rename(
    columns={'                                      Group HMDB': 'Group HMDB'})

# intensities: remaining rows of the same columns, labelled by the non-null entries of
# sheet column 0 (first entry skipped); last column dropped, missing values set to 0
needham_plasma_ms = needham_plasma.iloc[25:, 11:]
needham_plasma_ms.index = needham_plasma.iloc[:, 0].dropna().values[1:]
needham_plasma_ms = needham_plasma_ms.iloc[:, :-1].fillna(0)

# per-metabolite annotation: first 11 sheet columns, headed by sheet row 24
plasma_metabolite_md = needham_plasma.iloc[25:, :11]
plasma_metabolite_md.columns = needham_plasma.iloc[24, :11].values

def group_f(x):
    """Return 'ASD' only for the exact label 'ASD Yes'; anything else is 'Control'."""
    return 'ASD' if x == 'ASD Yes' else 'Control'
needham_plasma_md['Status'] = needham_plasma_md['Group HMDB'].apply(group_f)

# 'ND' ages count as missing; samples without an age are dropped before matching
needham_plasma_md['AGE'] = needham_plasma_md['AGE'].replace('ND', np.nan)
needham_plasma_md = needham_plasma_md.dropna(subset=['AGE'])
needham_plasma_md['Match_IDs'] = _matchmaker(
    needham_plasma_md,
    'Status',
    ['AGE', 'GENDER'],
    types=[False, True],
)
# unmatched samples are discarded; match ids are then stored as integers
needham_plasma_md = (
    needham_plasma_md
    .dropna(subset=['Match_IDs'])
    .astype({'Match_IDs': np.int64})
)
# restrict the intensity columns to the matched samples
needham_plasma_ms = needham_plasma_ms[needham_plasma_md.index]

# NOTE(review): the metabolite annotation goes under data_dir, the other outputs under res_dir
plasma_metabolite_md.to_csv(
    os.path.join(data_dir, 'Needham2020', 'plasma_metabolite_metadata.txt'), sep='\t')
save_qiime2_metadata(
    f'{res_dir}/plasma_sample_metadata.txt',
    needham_plasma_md[['Status', 'Match_IDs']],
    types={'Match_IDs': 'str'},
)
with biom_open(f'{res_dir}/plasma.biom', 'w') as f:
    # table index supplies the observation ids, its columns the sample ids
    table = biom.Table(needham_plasma_ms.values, needham_plasma_ms.index, needham_plasma_ms.columns)
    table.to_hdf5(f, 'plasma')

In [3]:
# mean and standard deviation over every entry of the plasma intensity table
plasma_values = needham_plasma_ms.values
mu, sigma = plasma_values.mean(), plasma_values.std()
print('mu', mu, 'sigma', sigma)

mu 19429836.67734174 sigma 396457334.7573488


In [4]:
# Sanity check: intensity table, sample metadata and the most recently built BIOM table.
needham_plasma_ms.shape, needham_plasma_md.shape, table.shape

((1611, 80), (80, 27), (1611, 80))

In [101]:
# parse out fecal metabolites
needham_fecal = pd.read_excel(needham, sheet_name = 'Table S5. OrigScale HumFec')
#needham_fecal = pd.read_excel(needham, sheet_name = 'Table S6. ScaledImpData HumFec')
needham_fecal_md = needham_fecal.iloc[:17, 11:].T
needham_fecal_md = needham_fecal.iloc[:17, 11:].T
needham_fecal_md.columns = needham_fecal.iloc[:17, 10].values
needham_fecal_md = needham_fecal_md.iloc[:-1]
needham_fecal_ms = needham_fecal.iloc[17:, 11:]
needham_fecal_ms.index = needham_fecal.iloc[:, 0].dropna().values[1:]
needham_fecal_ms = needham_fecal_ms.iloc[:, :-1]
needham_fecal_ms = needham_fecal_ms.fillna(0)
needham_fecal_md = needham_fecal_md.rename(
    columns={'                                      Group HMDB': 'Group HMDB'})
fecal_metabolite_md = needham_fecal.iloc[17:, :11]
fecal_metabolite_md.columns = needham_fecal.iloc[16, :11].values
needham_fecal_md['Status'] = needham_fecal_md['Group HMDB'].apply(group_f)
needham_fecal_md['AGE'] = needham_fecal_md['AGE'].replace('.', np.nan)
needham_fecal_md = needham_fecal_md.dropna(subset=['AGE'])
needham_fecal_md['Match_IDs'] = _matchmaker(
    needham_fecal_md, 'Status', ['AGE', 'GENDER'], types=[False, True])
needham_fecal_md = needham_fecal_md.dropna(subset=['Match_IDs'])
needham_fecal_md['Match_IDs'] = needham_fecal_md['Match_IDs'].astype(np.int64)
needham_fecal_ms = needham_fecal_ms[needham_fecal_md.index]
needham_fecal_md.to_csv(os.path.join(data_dir, 'Needham2020', 'fecal_sample_metadata.txt'), sep='\t')
fecal_metabolite_md.to_csv(os.path.join(data_dir, 'Needham2020', 'fecal_metabolite_metadata.txt'), sep='\t')
save_qiime2_metadata(f'{res_dir}/fecal_sample_metadata.txt',
                     needham_fecal_md[['Status', 'Match_IDs']], types={'Match_IDs': 'str'})
with biom_open('../sfari/data/metabolomics/Needham2020/fecal.biom', 'w') as f:
    table = biom.Table(needham_fecal_ms.values, needham_fecal_ms.index, needham_fecal_ms.columns)
    table.to_hdf5(f, 'fecal')

In [6]:
# Sanity check: fecal intensity table and sample metadata after matching.
needham_fecal_ms.shape, needham_fecal_md.shape

((814, 44), (44, 19))

In [7]:
# mean and standard deviation over every entry of the fecal intensity table
fecal_values = needham_fecal_ms.values
mu, sigma = fecal_values.mean(), fecal_values.std()
print('mu', mu, 'sigma', sigma)

mu 46429195.47040479 sigma 397690723.927873


# Kuwabara 2013

Relevant paper:
https://pubmed.ncbi.nlm.nih.gov/24058493/

In [8]:
res_dir = '../sfari/data/metabolomics/Kuwabara2013'
kuwabara = pd.read_excel(
    os.path.join(data_dir, 'Kuwabara2013', 'metabolomics_absolute_age.xlsx'))
# sheet rows at positions 1 and 2, transposed and stripped of empty entries;
# the values of row 1 become the index
kuwabara_md = kuwabara.iloc[1:3].T.dropna().set_index(1)
def status_f(x):
    """Return 'ASD' when the sample id contains 'ASO', otherwise 'Control'.

    NOTE: it is not clear what the 'CT-493' id stands for; it falls under 'Control'.
    """
    return 'ASD' if 'ASO' in x else 'Control'
# sample metadata: derive Status from the sample ids, name the age column, make it numeric
kuwabara_md['Status'] = [status_f(sample_id) for sample_id in kuwabara_md.index]
kuwabara_md = (
    kuwabara_md
    .rename(columns={2: 'age'})
    .astype({'age': np.float64})
)
kuwabara_md.index.name = 'sampleid'

# intensities: sheet rows from position 3 and columns from position 5, labelled by KEGG id;
# 'N.D.' entries become 0
kuwabara_ms = kuwabara.iloc[3:, 5:]
kuwabara_ms.index = kuwabara['KEGG ID'].dropna().values
kuwabara_ms = kuwabara_ms.replace('N.D.', 0)
kuwabara_ms = kuwabara_ms.T.dropna().T  # remove nan columns
kuwabara_ms.columns = kuwabara_md.index

In [9]:
# Sanity check: intensity table and sample metadata before matching.
kuwabara_ms.shape, kuwabara_md.shape

((112, 20), (20, 2))

In [10]:
# pair samples across Status on age only
# (types flag presumably means False = continuous -- confirm in q2_matchmaker)
kuwabara_md['Match_IDs'] = _matchmaker(
    kuwabara_md, 'Status', ['age'], types=[False])
# unmatched samples are discarded; match ids are then stored as integers
kuwabara_md = kuwabara_md.dropna(subset=['Match_IDs'])
kuwabara_md['Match_IDs'] = kuwabara_md['Match_IDs'].astype(np.int64)
# restrict the intensity columns to the matched samples
kuwabara_ms = kuwabara_ms[kuwabara_md.index]
#kuwabara_md.to_csv(os.path.join(data_dir, 'Kuwabara2013', 'plasma_sample_metadata.txt'), sep='\t')
save_qiime2_metadata(f'{res_dir}/plasma_sample_metadata.txt',
                     kuwabara_md[['Status', 'Match_IDs']], types={'Match_IDs': 'str'})
# the BIOM path is derived from res_dir, like the metadata path above
with biom_open(f'{res_dir}/plasma.biom', 'w') as f:
    table = biom.Table(kuwabara_ms.values, kuwabara_ms.index, kuwabara_ms.columns)
    table.to_hdf5(f, 'plasma')

In [11]:
# Sanity check: intensity table and sample metadata after matching (Match_IDs column added).
kuwabara_ms.shape, kuwabara_md.shape

((112, 20), (20, 3))

In [12]:
# within-match age differences: mean and standard deviation
matched_ages = kuwabara_md[['age', 'Match_IDs']]
age_diffs = matched_ages.groupby('Match_IDs').diff().dropna()
diff_mean, diff_std = age_diffs.mean().values[0], age_diffs.std().values[0]
print(diff_mean, diff_std)

-0.7 3.653004851412662


In [13]:
# mean and standard deviation over every entry of the Kuwabara intensity table
kuwabara_values = kuwabara_ms.values
mu, sigma = kuwabara_values.mean(), kuwabara_values.std()
print('mu', mu, 'sigma', sigma)

mu 44.45148566083439 sigma 190.01866864545033


# Noto 2014

In [14]:
# missing age / sex information
noto = os.path.join(data_dir, 'Noto2014', 'Controlli vs ASD_Mussap_2020.xlsx')
noto = pd.read_excel(noto)

  warn(msg)


# West 2014

In [15]:
# it looks like they only have mass and retention time unfortunately
west = os.path.join(data_dir, 'West2014', 'pone.0112445.s005.xlsx')
west = pd.read_excel(west)

# Kang 2018

In [16]:
data_dir = '~/ceph/sfari/data/metabolomics'
res_dir = '../sfari/data/metabolomics/Kang2018'
# a single workbook holds both the metadata sheet and the metabolite sheet
kang = os.path.join(data_dir, 'Kang2018', 'Kangetal2018.xlsx')

# sample metadata: transposed 'Metadatabase' sheet, limited to rows that have both age and Group
kang_md = (
    pd.read_excel(kang, sheet_name='Metadatabase', skiprows=1)
    .set_index('Sample ID')
    .T
    .dropna(subset=['age', 'Group'])
)
kang_md['Status'] = kang_md['Group']

# intensities: rows with any missing value are dropped, then rows whose total is not
# positive, and the columns are restricted to the samples kept in the metadata
kang_ms = (
    pd.read_excel(kang, sheet_name='metabolites', skiprows=1)
    .set_index('Sample ID')
    .dropna()
)
idx = kang_ms.sum(axis=1) > 0
kang_ms = kang_ms.loc[idx, kang_md.index]

In [17]:
# Sanity check: intensity table and sample metadata before matching.
kang_ms.shape, kang_md.shape

((59, 44), (44, 28))

In [18]:
# pair samples across Status on age and Gender
# (types flags presumably mean False = continuous, True = categorical -- confirm in q2_matchmaker)
kang_md['Match_IDs'] = _matchmaker(
    kang_md, 'Status', ['age', 'Gender'], types=[False, True])
# unmatched samples are discarded; match ids are then stored as integers
kang_md = kang_md.dropna(subset=['Match_IDs'])
kang_md['Match_IDs'] = kang_md['Match_IDs'].astype(np.int64)
# restrict the intensity columns to the matched samples
kang_ms = kang_ms[kang_md.index]
#kang_md.to_csv(os.path.join(data_dir, 'Kang2018', 'fecal_metabolite_metadata.txt'), sep='\t')
save_qiime2_metadata(f'{res_dir}/fecal_sample_metadata.txt',
                     kang_md[['Status', 'Match_IDs']], types={'Match_IDs': 'str'})
# the BIOM path is derived from res_dir, like the metadata path above
with biom_open(f'{res_dir}/fecal.biom', 'w') as f:
    table = biom.Table(kang_ms.values, kang_ms.index, kang_ms.columns)
    table.to_hdf5(f, 'fecal')

In [19]:
# mean and standard deviation over every entry of the Kang intensity table
kang_values = kang_ms.values
mu, sigma = kang_values.mean(), kang_values.std()
print('mu', mu, 'sigma', sigma)

mu 7.941069833522684 sigma 27.74041650248653


In [20]:
# Sanity check: intensity table and sample metadata after matching (Match_IDs column added).
kang_ms.shape, kang_md.shape

((59, 42), (42, 29))