<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import biom
import numpy as np
from q2_matchmaker._matching import _matchmaker

from scipy.spatial.distance import pdist, squareform
from scipy.optimize import linear_sum_assignment
import pandas as pd


def _standardize(x):
    return (x - x.min()) / (x.max() - x.min())


def _matchmaker(metadata, status, match_columns, types):
    """ Computes matching ids.
    
    Parameters
    ----------
    metadata : pd.DataFrame
        Sample metadata
    status : str
        Column for specifying case-control status
    match_columns : list of str
        List of metadata categories
    types : list of bool
        Specifies if it is categorical or not.
        True for categorical, False for continuous
    Returns
    -------
    pd.Series : List of matching ids
    """
    md = metadata.sort_values(by=status)
    dummies = []
    for col, cat in zip(match_columns, types):
        if cat:
            df = pd.get_dummies(md[col])
            dummies.append(df)
        else:
            df = pd.DataFrame(_standardize(md[col]))
            dummies.append(df)
    dm = sum(map(lambda x: squareform(pdist(x)) ** 2, dummies))
    i = (md[status].values == md[status].values[0]).sum()
    x, y = linear_sum_assignment(dm[:i, i:])
    y = y + i
    md.loc[md.index[x], 'matching_id'] = x
    md.loc[md.index[y], 'matching_id'] = x
    return md['matching_id']

In [2]:
# Directory holding the input files, relative to this notebook.
DIR = '../sfari/data/diaimmune'
# Tab-separated sample information table.
md = pd.read_csv(f'{DIR}/sample_information_from_prep_8810.tsv', sep='\t')

In [3]:
# Alphabetical listing of the available metadata columns.
cols = sorted(md.columns)
cols

['aab_level',
 'aab_positive',
 'aab_post_fistpos',
 'aab_post_lastneg',
 'abxatcollection',
 'abxexposureabsolute',
 'abxprecollection',
 'age_at_collection',
 'age_at_collection_units',
 'age_at_seroconversion_2_diff_aabs',
 'age_at_t1d_onset',
 'age_at_t1d_onset_units',
 'agedays_haz_last',
 'agedays_waz_last',
 'agedays_zbmi_last',
 'barley',
 'bf',
 'bf_exclusive_duration',
 'bf_exclusive_positive',
 'bf_long_term',
 'breastfeeding_length',
 'breastfeeding_length_exclusive',
 'buckwheat_millet',
 'case_control',
 'cereal',
 'cohort',
 'collection_device',
 'collection_location',
 'collection_method',
 'collection_timestamp',
 'container_1',
 'container_2',
 'corn',
 'country',
 'csection',
 'current_t1d',
 'delivery_route',
 'diabimmune_sample_id',
 'dietary_compound_apple_start_age_month',
 'dietary_compound_banana_start_age_month',
 'dietary_compound_barley_start_age_month',
 'dietary_compound_beef_start_age_month',
 'dietary_compound_beetroot_start_age_month',
 'dietary_compoun

In [4]:
# Display the case/control label column.
md['case_control']

0      control
1         case
2         case
3         case
4         case
        ...   
121    control
122       case
123       case
124       case
125       case
Name: case_control, Length: 126, dtype: object

In [5]:
# Display the sex column (used below as a categorical matching covariate).
md['sex']

0        Male
1      Female
2      Female
3      Female
4        Male
        ...  
121    Female
122    Female
123    Female
124    Female
125    Female
Name: sex, Length: 126, dtype: object

In [6]:
# Display the age-at-collection column (used below as a continuous matching covariate).
md['age_at_collection']

0       303
1       208
2       249
3       355
4       208
       ... 
121     736
122     722
123    1025
124     629
125     568
Name: age_at_collection, Length: 126, dtype: int64

In [7]:
def diabetes_f(x):
    """Map the label 'case' to 'diabetes'; return any other value unchanged."""
    return 'diabetes' if x == 'case' else x
# Relabel 'case' as 'diabetes'; every other label passes through unchanged.
md['Status'] = md['case_control'].apply(diabetes_f)

# Covariates to match on, in the same order as the `types` flags passed to
# _matchmaker below (age is continuous, sex is categorical).
# NOTE(review): an earlier comment here said controls have no age
# information -- confirm; rows missing either covariate are dropped below.
confounders = ['age_at_collection', 'sex']

# One row per subject: exclude the 'CD' label, drop rows with a missing
# covariate, then collapse each host_subject_id group.
# NOTE(review): GroupBy.last() takes the last non-null value of each column
# independently, so a collapsed row may combine values from different
# samples of the same subject -- confirm this is intended.
dia_md = (
    md.loc[md['case_control'] != 'CD']
    .dropna(subset=confounders)
    .groupby('host_subject_id')
    .last()
)

# Pair subjects across the two case_control groups on the covariates.
dia_md['Match_IDs'] = _matchmaker(
    dia_md,
    status='case_control',
    match_columns=confounders,
    types=[False, True],
)

# Keep only paired subjects, prefix the pair ids with the study name,
# re-index by sample id and tag the cohort.
dia_md = (
    dia_md.dropna(subset=['Match_IDs'])
    .assign(Match_IDs=lambda frame: frame['Match_IDs'].apply(
        lambda x: f'Kostic_2016_{int(x)}'))
    .set_index('sample_id')
    .assign(Cohort='Kostic_2016')
)

In [8]:
# Write the matched metadata as a tab-separated file. A later cell writes to
# this same path again after filtering the metadata against the table.
dia_md.to_csv(f'{DIR}/sample_metadata.txt', sep='\t')

In [9]:
!ls -lhrt ../sfari/data/diaimmune/

total 12M
-rw-rw-r-- 1 jmorton jmorton 2.9M Dec 15  2020 74945_otu_table.biom
-rw-rw-r-- 1 jmorton jmorton 3.1M Dec 15  2020 73074_otu_table.biom
-rw-rw-r-- 1 jmorton jmorton 2.2M Dec 15  2020 12496_20190524-184432.txt
drwxrwsr-x 2 jmorton jmorton  52M Dec 16  2020 formatted
-rw-rw-r-- 1 jmorton jmorton 1.2M Sep 15 20:31 106363_free.biom
-rw-rw-r-- 1 jmorton jmorton 163K Sep 15 20:32 sample_information_from_prep_8810.tsv
-rw-rw-r-- 1 jmorton jmorton 1.4K Sep 15 22:18 diff_abs.sh~
-rw-rw-r-- 1 jmorton jmorton 1.4K Sep 15 22:19 diff_abs.sh
-rw-rw-r-- 1 jmorton jmorton 277K Sep 16 13:48 table.biom
drwxrwsr-x 2 jmorton jmorton    0 Sep 16 13:50 age_sex_matched_posterior
drwxrwsr-x 2 jmorton jmorton 297K Sep 16 13:52 intermediate
-rw-rw-r-- 1 jmorton jmorton 1.9M Sep 16 14:05 106365_none.biom
-rw-rw-r-- 1 jmorton jmorton  26K Sep 16 14:10 sample_metadata.txt


In [10]:
# Load the BIOM table that will be filtered against the matched metadata.
table = biom.load_table(f'{DIR}/106365_none.biom')

In [11]:
def match_f(table, metadata):
    """Restrict a table and its metadata to the samples they share.

    Parameters
    ----------
    table : object with ``filter(f, axis=..., inplace=...)`` and ``ids()``
        The table to filter (this notebook passes the result of
        ``biom.load_table``). It is not modified; filtering is done with
        ``inplace=False``.
    metadata : pd.DataFrame
        Sample metadata indexed by sample id.

    Returns
    -------
    tuple
        ``(t, md)`` where ``t`` keeps only the samples whose id is in
        ``metadata.index`` and only the observations with at least one
        value greater than zero among those samples, and ``md`` is
        ``metadata`` re-indexed to ``t.ids()``.
    """
    # Build the id lookup once rather than once per sample inside the
    # filter callback.
    wanted = set(metadata.index)

    def _in_metadata(values, sample_id, sample_md):
        return sample_id in wanted

    def _is_observed(values, obs_id, obs_md):
        return np.sum(values > 0) > 0

    t = table.filter(_in_metadata, axis='sample', inplace=False)
    t = t.filter(_is_observed, axis='observation', inplace=False)
    md = metadata.loc[t.ids()]
    return t, md
# Filter the table and the matched metadata down to their shared samples.
# Note that `table` is rebound to the filtered table here.
table, metadata = match_f(table, dia_md)

In [12]:
from biom.util import biom_open
# Persist the filtered metadata as a tab-separated file.
metadata.to_csv(f'{DIR}/sample_metadata.txt', sep='\t')
# Persist the filtered table in HDF5 format; 'matched' is passed as the
# second positional argument of to_hdf5.
with biom_open(f'{DIR}/table.biom', 'w') as out_handle:
    table.to_hdf5(out_handle, 'matched')