In [1]:
import numpy as np
import pandas as pd

## Function Definition

In [67]:
def get_samesubject_sampleids(mdf, mkey, mvals, mskey="subject_id", axis=1, return_subjectid=False):
    """Find, for every subject, the single sample carrying each requested metadata value.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Metadata table; rows are metadata keys, columns are sample ids.
    mkey : label
        Row of ``mdf`` whose values are matched against ``mvals``.
    mvals : iterable
        Metadata values to look up, in the order the result should follow.
    mskey : label, default "subject_id"
        Row of ``mdf`` that holds the subject identifier of each sample.
    axis : {0, 1}, default 1
        1 -> one list per value in ``mvals`` (each ordered by subject);
        0 -> one list per subject (each ordered like ``mvals``).
    return_subjectid : bool, default False
        When true, also return the sorted list of subject ids.

    Returns
    -------
    list of list, or (list of list, list)
        Sample ids arranged according to ``axis``; with ``return_subjectid``
        the sorted subject ids are returned as a second element.

    Raises
    ------
    ValueError
        If ``axis`` is not 0 or 1, or if some (subject, value) combination
        does not match exactly one sample.
    """
    # The original fell through every branch and returned None for any other
    # axis; fail early and explicitly instead.
    if axis not in (0, 1):
        raise ValueError(f"axis must be 0 or 1, got {axis!r}")

    m = "metadata table is not unique by -mk and -msk"
    subject_row = mdf.loc[mskey, :]
    value_row = mdf.loc[mkey, :]
    subject_ids = sorted(set(subject_row))

    sample_ids = []
    for mval in mvals:
        value_mask = value_row == mval
        _sample_ids = []
        for subject_id in subject_ids:
            # Positional boolean mask over the columns of mdf.
            hit = ((subject_row == subject_id) & value_mask).to_numpy()
            matched = mdf.columns[hit]
            # Zero matches (missing sample) and several matches (duplicate
            # sample) are both rejected.
            if len(matched) != 1:
                raise ValueError(m)
            _sample_ids.append(matched[0])
        sample_ids.append(_sample_ids)

    if axis == 0:
        # Transpose: group by subject instead of by metadata value.
        sample_ids = [list(x) for x in zip(*sample_ids)]

    if return_subjectid:
        return sample_ids, subject_ids
    return sample_ids


def calc_samesubject_diff(df, mdf, mkey, mv1s, mv2s, mskey="subject_id"):
    """Subtract each subject's control sample from the same subject's test sample.

    For every ``(mv1, mv2)`` pair taken from ``zip(mv1s, mv2s)``, the sample
    whose ``mkey`` value is ``mv1`` is treated as control and the one whose
    value is ``mv2`` as test.

    Parameters
    ----------
    df : pandas.DataFrame
        Data table whose columns are sample ids.
    mdf : pandas.DataFrame
        Metadata table; rows are metadata keys, columns are sample ids.
    mkey : label
        Metadata row used to tell control and test samples apart.
    mv1s, mv2s : iterable
        Control values and test values, paired position by position.
    mskey : label, default "subject_id"
        Metadata row holding the subject identifier.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``out_df`` has one column per subject and pair, named
        ``f"{subject_id}_{mv2}{mv1}"``, holding ``test - control``.
        ``out_mdf`` has the same columns and one row per metadata key of
        ``mdf``: the shared value when control and test agree, otherwise
        ``f"{val_test}{val_ctrl}"``.

    Raises
    ------
    ValueError
        Propagated from ``get_samesubject_sampleids`` when a subject does not
        have exactly one sample for a requested value.
    """
    diff_columns = {}
    meta_columns = {}
    for mv1, mv2 in zip(mv1s, mv2s):
        (sid_ctrls, sid_tests), subject_ids = get_samesubject_sampleids(
            mdf, mkey, [mv1, mv2], mskey=mskey, return_subjectid=True
        )
        for sid_ctrl, sid_test, subject_id in zip(sid_ctrls, sid_tests, subject_ids):
            new_sid = f"{subject_id}_{mv2}{mv1}"
            diff_columns[new_sid] = df[sid_test] - df[sid_ctrl]

            merged = []
            # Iterate the metadata table that was passed in; the original read
            # the notebook-level global `me` here, which is the wrong table
            # whenever a different metadata frame is supplied.
            for ind in mdf.index:
                val_ctrl = mdf.loc[ind, sid_ctrl]
                val_test = mdf.loc[ind, sid_test]
                if val_ctrl == val_test:
                    merged.append(val_ctrl)
                elif pd.isna(val_ctrl) and pd.isna(val_test):
                    # NaN != NaN, so without this branch a value missing on
                    # both sides would become the string "nannan".
                    merged.append(val_ctrl)
                else:
                    merged.append(f"{val_test}{val_ctrl}")
            # dtype=object keeps the values exactly as they were in mdf.
            meta_columns[new_sid] = pd.Series(merged, index=mdf.index, dtype=object)

    # Build each frame once instead of growing it column by column / cell by cell.
    out_df = pd.DataFrame(diff_columns)
    out_mdf = pd.DataFrame(meta_columns)
    return out_df, out_mdf


def calc_samesubject_dist(df, mdf, mkey, mvals, mskey="subject_id", label="wu"):
    """Collect, per subject, the entry of ``df`` between its control and test samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Table looked up as ``df.loc[control_sample_id, test_sample_id]``, so
        sample ids must be present on both axes.
    mdf : pandas.DataFrame
        Metadata table; rows are metadata keys, columns are sample ids.
    mkey : label
        Metadata row used to tell control and test samples apart.
    mvals : sequence of length 2
        ``(control_value, test_value)`` for ``mkey``.
    mskey : label, default "subject_id"
        Metadata row holding the subject identifier.
    label : label, default "wu"
        Row label of the returned single-row frame.

    Returns
    -------
    pandas.DataFrame
        One row named ``label`` and one column per subject, named
        ``f"{subject_id}_{test_value}{control_value}"``. An empty frame is
        returned when there are no subjects.

    Raises
    ------
    ValueError
        If ``mvals`` does not contain exactly two values, or (propagated from
        ``get_samesubject_sampleids``) when a subject does not have exactly
        one sample for a requested value.
    """
    if len(mvals) != 2:
        raise ValueError("mvals must contain exactly two values: (control, test)")
    mv_ctrl, mv_test = mvals

    # Use the arguments; the original ignored them in favour of the global
    # `me` and the literals "timepoint", ["0w", "4w"], "subject_id".
    (sid_ctrls, sid_tests), subject_ids = get_samesubject_sampleids(
        mdf, mkey, [mv_ctrl, mv_test], mskey=mskey, axis=1, return_subjectid=True
    )

    distances = {
        f"{subject_id}_{mv_test}{mv_ctrl}": df.loc[sid_ctrl, sid_test]
        for sid_ctrl, sid_test, subject_id in zip(sid_ctrls, sid_tests, subject_ids)
    }
    if not distances:
        # Same result as the original when there is nothing to collect.
        return pd.DataFrame()
    return pd.DataFrame(distances, index=[label])


def extract_df(df, mdf, mkey, mvals, axis=1):
    """Restrict a data table and its metadata to samples with given metadata values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data table labelled by sample id along the axis selected by ``axis``.
    mdf : pandas.DataFrame
        Metadata table; rows are metadata keys, columns are sample ids.
    mkey : label
        Metadata row to filter on.
    mvals : list-like
        Accepted values of ``mkey``.
    axis : {0, 1, 2}, default 1
        0 -> select rows of ``df``; 1 -> select columns of ``df``;
        2 -> select both rows and columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The extracted part of ``df`` and the matching columns of ``mdf``.

    Raises
    ------
    ValueError
        If ``mkey`` is not a row of ``mdf``, if ``axis`` is not 0, 1 or 2,
        or if no sample matches ``mvals``.
    """
    if mkey not in mdf.index:
        raise ValueError("metadata key is not in metadata")
    # The original left `extracted_df` unbound for any other axis and failed
    # with UnboundLocalError at the return statement.
    if axis not in (0, 1, 2):
        raise ValueError(f"axis must be 0, 1 or 2, got {axis!r}")

    sample_ids = mdf.columns[mdf.loc[mkey, :].isin(mvals)]
    if len(sample_ids) == 0:
        raise ValueError("can't find sample_ids")

    if axis == 0:
        extracted_df = df.loc[sample_ids, :]
    elif axis == 1:
        extracted_df = df.loc[:, sample_ids]
    else:
        extracted_df = df.loc[sample_ids, sample_ids]
    extracted_mdf = mdf.loc[:, sample_ids]
    return extracted_df, extracted_mdf

## Loading Data

In [68]:
# Every input table is tab-separated, takes its header from the first row and
# its row index from the first column.
_read_opts = dict(sep="\t", header=0, index_col=0)

ge = pd.read_csv("../data_input/ge.tsv", **_read_opts)
co = pd.read_csv("../data_input/co.tsv", **_read_opts)
ra = pd.read_csv("../data_input/ra.tsv", **_read_opts)
ad = pd.read_csv("../data_input/ad.tsv", **_read_opts)
nm = pd.read_csv("../data_input/nm.tsv", **_read_opts)
wu = pd.read_csv("../data_input/wu.tsv", **_read_opts)
uu = pd.read_csv("../data_input/uu.tsv", **_read_opts)
# `me` / `me_all` are the tables passed as the metadata argument further down.
me = pd.read_csv("../data_input/me.tsv", **_read_opts)
nm_all = pd.read_csv("../data_input/nm.all.tsv", **_read_opts)
me_all = pd.read_csv("../data_input/me.all.tsv", **_read_opts)

In [69]:
# Keep the rows of `ge` whose mean across columns is at least 0.001.
ge_row_mean = ge.mean(axis=1)
ge_0001 = ge.loc[ge_row_mean >= 0.001]

# Keep the rows of `ra` that are positive in at least 25% of the columns.
ra_positive_count = (ra > 0).sum(axis=1)
ra_min_count = ra.shape[1] * 0.25
ra_25 = ra.loc[ra_positive_count >= ra_min_count]

In [70]:
# Write the filtered tables out as tab-separated files.
for filtered_table, out_path in (
    (ge_0001, "../data_inter/ge.0001.tsv"),
    (ra_25, "../data_inter/ra.25.tsv"),
):
    filtered_table.to_csv(out_path, sep="\t")

## Calculation of difference from baseline

In [71]:
# Within-subject difference "4w" minus "0w" for each table. Every call also
# returns a metadata frame; `me_diff_4w` is rebound each time and ends up
# holding the one from the last 4w call.
diff_4w_opts = dict(mkey="timepoint", mv1s=["0w"], mv2s=["4w"], mskey="subject_id")
ge_diff_4w, me_diff_4w = calc_samesubject_diff(ge_0001, me, **diff_4w_opts)
co_diff_4w, me_diff_4w = calc_samesubject_diff(co, me, **diff_4w_opts)
ra_diff_4w, me_diff_4w = calc_samesubject_diff(ra_25, me, **diff_4w_opts)
ad_diff_4w, me_diff_4w = calc_samesubject_diff(ad, me, **diff_4w_opts)
nm_diff_4w, me_diff_4w = calc_samesubject_diff(nm, me, **diff_4w_opts)

# Same calculation for "2w" minus "0w", using the *.all tables.
diff_2w_opts = dict(mkey="timepoint", mv1s=["0w"], mv2s=["2w"], mskey="subject_id")
nm_diff_2w, me_diff_2w = calc_samesubject_diff(nm_all, me_all, **diff_2w_opts)

## Extraction of subjects in each group

In [72]:
# Split every table by the "group" metadata row into its "C" and "T" samples
# (selected along the columns). `me_C` / `me_T` are rebound on every call and
# keep the value from the last one.
group_C = dict(mkey="group", mvals=["C"], axis=1)
group_T = dict(mkey="group", mvals=["T"], axis=1)

ge_C, me_C = extract_df(ge_0001, me, **group_C)
ge_T, me_T = extract_df(ge_0001, me, **group_T)
co_C, me_C = extract_df(co, me, **group_C)
co_T, me_T = extract_df(co, me, **group_T)
ra_C, me_C = extract_df(ra_25, me, **group_C)
ra_T, me_T = extract_df(ra_25, me, **group_T)
ad_C, me_C = extract_df(ad, me, **group_C)
ad_T, me_T = extract_df(ad, me, **group_T)
nm_C, me_C = extract_df(nm, me, **group_C)
nm_T, me_T = extract_df(nm, me, **group_T)

## Calculation of the same subject distance

In [73]:
# Both tables are processed with the same metadata arguments:
# (metadata table, metadata key, [control value, test value], subject key).
dist_args = (me, "timepoint", ["0w", "4w"], "subject_id")
wu_dist = calc_samesubject_dist(wu, *dist_args)
uu_dist = calc_samesubject_dist(uu, *dist_args)

## Exporting Data

In [76]:
# (table, destination) pairs, written in this order as tab-separated files.
exports = (
    # within-subject differences and their metadata
    (ge_diff_4w, "../data_inter/ge.diff.0w4w.tsv"),
    (co_diff_4w, "../data_inter/co.diff.0w4w.tsv"),
    (ra_diff_4w, "../data_inter/ra.diff.0w4w.tsv"),
    (ad_diff_4w, "../data_inter/ad.diff.0w4w.tsv"),
    (nm_diff_4w, "../data_inter/nm.diff.0w4w.tsv"),
    (nm_diff_2w, "../data_inter/nm.diff.0w2w.tsv"),
    (me_diff_4w, "../data_inter/me.diff.0w4w.tsv"),
    (me_diff_2w, "../data_inter/me.diff.0w2w.tsv"),
    # same-subject distances
    (wu_dist, "../data_inter/wu.diff.0w4w.tsv"),
    (uu_dist, "../data_inter/uu.diff.0w4w.tsv"),
    # per-group extractions and their metadata
    (ge_C, "../data_inter/ge.C.tsv"),
    (ge_T, "../data_inter/ge.T.tsv"),
    (co_C, "../data_inter/co.C.tsv"),
    (co_T, "../data_inter/co.T.tsv"),
    (ra_C, "../data_inter/ra.C.tsv"),
    (ra_T, "../data_inter/ra.T.tsv"),
    (ad_C, "../data_inter/ad.C.tsv"),
    (ad_T, "../data_inter/ad.T.tsv"),
    (nm_C, "../data_inter/nm.C.tsv"),
    (nm_T, "../data_inter/nm.T.tsv"),
    (me_C, "../data_inter/me.C.tsv"),
    (me_T, "../data_inter/me.T.tsv"),
)
for table, out_path in exports:
    table.to_csv(out_path, sep="\t")