<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Amplicon" data-toc-modified-id="Amplicon-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Amplicon</a></span><ul class="toc-item"><li><span><a href="#Price" data-toc-modified-id="Price-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Price</a></span></li><li><span><a href="#Vazquez-Baeza" data-toc-modified-id="Vazquez-Baeza-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Vazquez Baeza</a></span></li><li><span><a href="#Liu-2016" data-toc-modified-id="Liu-2016-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Liu 2016</a></span></li><li><span><a href="#Combine-datasets" data-toc-modified-id="Combine-datasets-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Combine datasets</a></span></li></ul></li><li><span><a href="#Shotgun" data-toc-modified-id="Shotgun-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Shotgun</a></span><ul class="toc-item"><li><span><a href="#Price2019" data-toc-modified-id="Price2019-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Price2019</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import biom
import numpy as np
from q2_matchmaker._matching import _matchmaker

from scipy.spatial.distance import pdist, squareform
from scipy.optimize import linear_sum_assignment
import pandas as pd


def _standardize(x):
    return (x - x.min()) / (x.max() - x.min())


def _matchmaker(metadata, status, match_columns, types):
    """ Computes matching ids.
    
    Parameters
    ----------
    metadata : pd.DataFrame
        Sample metadata
    status : str
        Column for specifying case-control status
    match_columns : list of str
        List of metadata categories
    types : list of bool
        Specifies if it is categorical or not.
        True for categorical, False for continuous
    Returns
    -------
    pd.Series : List of matching ids
    """
    md = metadata.sort_values(by=status)
    dummies = []
    for col, cat in zip(match_columns, types):
        if cat:
            df = pd.get_dummies(md[col])
            dummies.append(df)
        else:
            df = pd.DataFrame(_standardize(md[col]))
            dummies.append(df)
    dm = sum(map(lambda x: squareform(pdist(x)) ** 2, dummies))
    i = (md[status].values == md[status].values[0]).sum()
    x, y = linear_sum_assignment(dm[:i, i:])
    y = y + i
    md.loc[md.index[x], 'matching_id'] = x
    md.loc[md.index[y], 'matching_id'] = x
    return md['matching_id']

# Amplicon

In [2]:
# Root directory of the amplicon datasets; every path in this section is
# built relative to it.
DIR = '../sfari/data/sra_ibd_amplicon'

## Price 

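Let's split Crohn's disease (CD) and ulcerative colitis (UC) into separate categories, so that each one is matched against the non-IBD controls on its own.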

In [3]:
# Load the SRA run table of the Lloyd-Price 2019 amplicon cohort
# (read with the pandas default separator, i.e. comma).
md = pd.read_csv(f'{DIR}/Lloyd_Price2019/SraRunTable.txt')

def ibd_f(x):
    """Relabel the 'nonIBD' disease value as 'Control'.

    Any other value is handed back unchanged.
    """
    return 'Control' if x == 'nonIBD' else x
# Harmonised disease label: 'nonIBD' becomes 'Control' (see ibd_f above),
# every other value of 'Host_disease' is kept as is.
md['Status'] = md['Host_disease'].apply(ibd_f)

# no age information for the controls ...
confounders = ['host_sex']
# UC: everything that is not CD, i.e. UC cases plus the non-IBD controls
uc_md = md.loc[md['Host_disease'] != 'CD']
uc_md = uc_md.dropna(subset=confounders)
# Collapse to one row per subject (first non-null value of each column).
uc_md = uc_md.groupby('host_subject_id').first()
uc_md['Match_IDs'] = _matchmaker(uc_md, status='Host_disease',
                                 match_columns=confounders,
                                 types=[True])
# Subjects that could not be paired have no Match_IDs and are dropped.
uc_md = uc_md.dropna(subset=['Match_IDs'])
uc_md['Match_IDs'] = uc_md['Match_IDs'].apply(lambda x: f'Price_UC_2019_{int(x)}')
uc_md = uc_md.set_index('Run')
uc_md['Cohort'] = 'Lloyd_Price2019'

# CD: everything that is not UC, i.e. CD cases plus the non-IBD controls
cd_md = md.loc[md['Host_disease'] != 'UC']
cd_md = cd_md.dropna(subset=confounders)
cd_md = cd_md.groupby('host_subject_id').first()
cd_md['Match_IDs'] = _matchmaker(cd_md, status='Host_disease',
                                 match_columns=confounders,
                                 types=[True])
cd_md = cd_md.dropna(subset=['Match_IDs'])
cd_md['Match_IDs'] = cd_md['Match_IDs'].apply(lambda x: f'Price_CD_2019_{int(x)}')
cd_md = cd_md.set_index('Run')
cd_md['Cohort'] = 'Lloyd_Price2019'

# NOTE: 'Status' is intentionally left as computed at the top of this cell.
# It used to be overwritten with the raw 'Host_disease' values here, which
# turned 'Control' back into 'nonIBD' and made the label inconsistent with
# the 'Control' label used for the other cohorts.

cd_md.to_csv(f'{DIR}/Lloyd_Price2019/sample_metadata_cd.txt', sep='\t')
uc_md.to_csv(f'{DIR}/Lloyd_Price2019/sample_metadata_uc.txt', sep='\t')

## Vazquez Baeza

In [4]:
# Load the SRA run table of the Vazquez-Baeza 2017 cohort
# (read with the pandas default separator, i.e. comma).
md = pd.read_csv(f'{DIR}/Vazquez_Baeza2017/SraRunTable.txt')
def ibd_f(x):
    """Relabel the 'Crohns' value of the ``ibd`` column as 'CD'.

    Any other value is handed back unchanged.
    """
    is_crohns = (x == 'Crohns')
    if not is_crohns:
        return x
    return 'CD'
# Harmonised disease label ('Crohns' -> 'CD', everything else untouched).
md['Status'] = md['ibd'].apply(ibd_f)

confounders = ['sex']
# Subjects with a known sex, collapsed to one row per subject
# (first non-null value of each column).
cd_md2 = (
    md.dropna(subset=confounders)
    .groupby('host_subject_id')
    .first()
)
cd_md2['Match_IDs'] = _matchmaker(
    cd_md2, status='ibd', match_columns=confounders, types=[True]
)
# Drop unpaired subjects, turn the numeric pair id into a cohort-prefixed
# label, index by sequencing run and tag the cohort of origin.
cd_md2 = (
    cd_md2.dropna(subset=['Match_IDs'])
    .assign(Match_IDs=lambda df: df['Match_IDs'].apply(
        lambda pair_id: f'Vazquez_Baeza2017_{int(pair_id)}'))
    .set_index('Run')
    .assign(Cohort='Vazquez_Baeza2017')
)

cd_md2.to_csv(f'{DIR}/Vazquez_Baeza2017/sample_metadata_cd.txt', sep='\t')

## Liu 2016

In [5]:
# Load the SRA run table of the Liu 2016 cohort (tab-separated).
md = pd.read_csv(f'{DIR}/Liu2016/SraRunTable.txt', sep='\t')
def ibd_f(x):
    """Map a Liu 2016 ``diagnosis`` value onto the shared status labels.

    Parameters
    ----------
    x : object
        Raw diagnosis value, e.g. "Crohn's disease" or "Not IBD".

    Returns
    -------
    str or None
        'CD' for any string containing 'Crohn', 'Control' for 'Not IBD',
        and None for anything else (including non-string values such as
        NaN for a missing diagnosis).
    """
    if not isinstance(x, str):
        # A substring test on a float NaN would raise TypeError.
        return None
    # Match on 'Crohn' rather than 'Crohns': the diagnosis is spelled
    # "Crohn's disease" (with an apostrophe), which 'Crohns' never matches.
    if 'Crohn' in x:
        return 'CD'
    if x == 'Not IBD':
        return 'Control'
    return None
# Harmonised case/control label derived from the raw diagnosis string.
md['Status'] = md['diagnosis'].apply(ibd_f)

confounders = ['Host_Sex']
# Subjects with a known sex, collapsed to one row per subject
# (first non-null value of each column).
cd_md3 = (
    md.dropna(subset=confounders)
    .groupby('SUBJECT')
    .first()
)
cd_md3['Match_IDs'] = _matchmaker(
    cd_md3, status='diagnosis', match_columns=confounders, types=[True]
)
# Drop unpaired subjects, turn the numeric pair id into a cohort-prefixed
# label, index by sequencing run and tag the cohort of origin.
cd_md3 = (
    cd_md3.dropna(subset=['Match_IDs'])
    .assign(Match_IDs=lambda df: df['Match_IDs'].apply(
        lambda pair_id: f'Liu2016_{int(pair_id)}'))
    .set_index('Run')
    .assign(Cohort='Liu2016')
)

cd_md3.to_csv(f'{DIR}/Liu2016/sample_metadata_cd.txt', sep='\t')

In [6]:
# Sanity check: count the matched subjects per raw diagnosis value; after
# one-to-one matching both groups should be the same size.
cd_md3['diagnosis'].value_counts()

Crohn's disease    36
Not IBD            36
Name: diagnosis, dtype: int64

## Combine datasets

In [7]:
from biom.util import biom_open

# Stack the matched CD metadata of the three amplicon cohorts row-wise.
combined_md = pd.concat([cd_md, cd_md2, cd_md3], axis=0)

# Per-cohort feature tables.
price_biom = biom.load_table(f'{DIR}/Lloyd_Price2019/bioms/all.biom')
vazquez_biom = biom.load_table(f'{DIR}/Vazquez_Baeza2017/bioms/all.biom')
liu_biom = biom.load_table(f'{DIR}/Liu2016/bioms/all.biom')

# Merge the three tables into one, then align table and metadata to each
# other so both describe the same samples.
all_biom = price_biom.merge(vazquez_biom)
all_biom = all_biom.merge(liu_biom)
all_biom, combined_md = all_biom.align_to_dataframe(combined_md)
# NOTE(review): unlike the shotgun section, Match_IDs groups that are no
# longer complete pairs after alignment are not filtered out here -
# confirm whether downstream analysis tolerates unpaired samples.

combined_md.to_csv(f'{DIR}/Combined/combined_metadata.txt', sep='\t')
with biom_open(f'{DIR}/Combined/combined.biom', 'w') as f:
    all_biom.to_hdf5(f, 'combined')

# Shotgun

In [208]:
# Root directory of the shotgun datasets. This rebinds DIR, so every path
# from here on points at the shotgun data instead of the amplicon data.
DIR = '../sfari/data/sra_ibd_shotgun'

## Price2019

In [209]:
# Column holding the raw disease label in this run table.
status = 'Host_disease'

md = pd.read_csv(f'{DIR}/Lloyd_Price2019/SraRunTable(6).txt')
# Samples without a disease annotation are treated as controls.
md[status] = md[status].fillna('Control')

all_biom = biom.load_table(f'{DIR}/Lloyd_Price2019/bioms/ogus.biom')
# Align table and metadata on the run accession, then restore 'Run' as a
# regular column for the per-subject grouping done later.
all_biom, md = all_biom.align_to_dataframe(md.set_index('Run'))
md = md.reset_index()
def ibd_f(x):
    """Collapse a free-text disease label into 'Control', 'CD' or 'UC'.

    'nonIBD' becomes 'Control'; a label containing 'Crohn' (or equal to
    'CD') becomes 'CD'; a label containing 'colitis' (or equal to 'UC')
    becomes 'UC'. Anything else is returned unchanged.
    """
    if x == 'nonIBD':
        return 'Control'
    # (short label, keyword) pairs, checked in this order.
    for label, keyword in (('CD', 'Crohn'), ('UC', 'colitis')):
        if keyword in x or x == label:
            return label
    return x
# Drop rows lacking a disease label and add the harmonised 'Status'
# column; the lambda runs while `status` still names the raw column.
md = (
    md.dropna(subset=[status])
    .assign(Status=lambda df: df[status].apply(ibd_f))
)
# From here on `status` refers to the harmonised column.
status = 'Status'

In [210]:
# CD
cd_md = md.loc[md[status] != 'UC']
cd_md = cd_md.dropna(subset=confounders)
cd_md = cd_md.groupby('host_subject_id').first()
cd_md['Match_IDs'] = _matchmaker(cd_md, status=status, 
                                 match_columns=confounders, 
                                 types=[True])
cd_md = cd_md.dropna(subset=['Match_IDs'])
cd_md['Match_IDs'] = cd_md['Match_IDs'].apply(lambda x: f'Price_CD_2019_{int(x)}')
cd_md = cd_md.set_index('Run')
cd_md['Cohort'] = 'Lloyd_Price2019'
cd_md.to_csv(f'{DIR}/Lloyd_Price2019/sample_metadata_cd.txt', sep='\t')

In [211]:
# Keep only Match_IDs groups that consist of exactly two samples.
combined_md = cd_md.groupby('Match_IDs').filter(lambda pair: len(pair) == 2)

# Align the feature table to the retained samples. `all_biom` is rebound
# here, so re-running this cell operates on the already reduced table.
all_biom, combined_md = all_biom.align_to_dataframe(combined_md)

In [212]:
from biom.util import biom_open
# Persist the matched shotgun metadata as a tab-separated file ...
combined_md.to_csv(f'{DIR}/Combined/combined_metadata.txt', sep='\t')
# ... and the aligned feature table as HDF5 BIOM; 'combined' is passed as
# the generated-by string.
with biom_open(f'{DIR}/Combined/combined.biom', 'w') as f:
    all_biom.to_hdf5(f, 'combined')

In [213]:
# Row counts per harmonised status label in the aligned shotgun metadata
# (`md` is still per run here, before the per-subject grouping and matching).
md[status].value_counts()

CD         598
UC         375
Control    365
Name: Status, dtype: int64

In [214]:
# Sanity check: the number of metadata rows should equal the number of
# samples (second axis) of the aligned feature table.
combined_md.shape, all_biom.shape

((52, 42), (5795, 52))