In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import os

from sklearn.preprocessing import StandardScaler

from tqdm.auto import tqdm
from blood_response.gwas_preprocessing import pheno_quantile_transform

## Sysmex traits

In [None]:
# Table of selected traits (tab-separated); the `trait` column holds the trait names.
sel_traits_df = pd.read_csv(
    '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/trait_count_clumps_gated_quantile.tsv',
    sep='\t',
)

# Derive `trait2` from `trait`: drop every '-q' occurrence, then rewrite
# 'plt-f' as 'pltf'. `trait2` is what is later used to select phenotype columns.
sel_traits_df['trait2'] = (
    sel_traits_df['trait']
    .str.replace('-q', '')
    .str.replace('plt-f', 'pltf')
)

In [None]:
sysmex_pheno = pd.read_parquet(
    '/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/all_pheno_projected_filtered_median_wide.parquet'
)

# Flatten each column label `c` (read as c[0] and c[1] -- presumably a two-level
# column index; TODO confirm) into a single string:
#   "<c[1] up to the first '_'>|<c[0]>|<remainder of c[1] after the first '_'>"
sysmex_pheno.columns = [
    '|'.join([c[1].partition('_')[0], c[0], c[1].partition('_')[2]])
    for c in sysmex_pheno.columns
]

# Phenotype columns named by the selected traits.
selected_phenotypes = sysmex_pheno[sel_traits_df['trait2']]

# First 37 columns, relabelled by the middle ('|'-delimited) field of their name.
# NOTE(review): 37 is a hard-coded position count -- presumably these are the
# analysis-hour columns, one per perturbation; confirm against the parquet layout.
selected_phenotypes_analysis_hours = sysmex_pheno.iloc[:, 0:37].rename(
    columns=lambda name: name.split("|")[1]
)

In [None]:
steroids = pd.read_parquet('/mnt/obi0/phi/ehr/obi_biobank_annotations/steroid_obi_30d_21-07-06.parquet')
transplant = pd.read_parquet('/mnt/obi0/phi/ehr/obi_biobank_annotations/transplant_with_lvad_21-07-06.parquet')

# PatientIDs whose row-wise sum over all non-ID steroid columns is positive.
# After reset_index() the summed values live in the column labelled 0.
any_steroid_medication = (
    steroids
    .set_index('PatientID')
    .sum(axis=1)
    .reset_index()
    .loc[lambda totals: totals[0] > 0, 'PatientID']
)

# PatientIDs of rows whose `transplant` flag is set (the column is used directly as a row mask).
any_transplant = transplant.loc[transplant['transplant'], 'PatientID']

In [None]:
# Cohort 1: drop every row whose index value appears in `any_transplant`.
selected_phenotypes_notransplant = selected_phenotypes[
    ~selected_phenotypes.index.isin(any_transplant)
]

# Cohort 2: additionally drop rows whose index value appears in
# `any_steroid_medication` (i.e. keep rows that are in neither list).
selected_phenotypes_notransplant_nosteroid = selected_phenotypes[
    ~(
        selected_phenotypes.index.isin(any_transplant)
        | selected_phenotypes.index.isin(any_steroid_medication)
    )
]

In [None]:
# Specify the cohort used for the association tests below.
# Currently: patients listed in `any_transplant` are excluded, while patients
# listed in `any_steroid_medication` are retained;
# `selected_phenotypes_notransplant_nosteroid` is the stricter alternative.
# NOTE(review): the results CSV path written in the final cell hardcodes
# "notransplant" -- rename it if the cohort chosen here changes.
model_phenotypes = selected_phenotypes_notransplant

In [None]:
# Covariates: keep only the patient ID plus the three adjustment variables
# used in the regression formula (sex, race, age at consent).
covariates = pd.read_parquet(
    "/mnt/obi0/phi/ehr/obi_biobank_annotations/covariates_21-07-06.parquet"
).loc[:, ["PatientID", "is_male", "Race", "rc_consent_age"]]

In [None]:
# Load the four lab tables (read in the order listed); they are concatenated
# column-wise in the next cell.
bmp, cmp, ecg, lipids = map(
    pd.read_parquet,
    (
        "/mnt/obi0/phi/ehr/obi_biobank_annotations/labs_bmp_21-07-06.parquet",
        "/mnt/obi0/phi/ehr/obi_biobank_annotations/labs_cmp_21-07-06.parquet",
        "/mnt/obi0/phi/ehr/obi_biobank_annotations/labs_ecg_21-07-06.parquet",
        "/mnt/obi0/phi/ehr/obi_biobank_annotations/labs_lp_21-07-06.parquet",
    ),
)

In [None]:
# Standardize every lab column (StandardScaler defaults) so that regression
# betas are comparable across labs.
labs = pd.concat([bmp, cmp, ecg, lipids], axis='columns')

# Re-wrap the scaled array with the original labels, then move the index into
# the first column so it can be used as a merge key downstream.
labs_standardized = pd.DataFrame(
    data=StandardScaler().fit_transform(labs),
    columns=labs.columns,
    index=labs.index,
).reset_index()

In [None]:
# Quantile-transform all blood traits of the chosen cohort.
# `pheno_quantile_transform` is imported from blood_response.gwas_preprocessing;
# its exact transform is not visible in this notebook. It is given the cohort
# frame and the full list of that frame's columns, and the regression loop below
# iterates over the columns of the returned frame.
model_phenotypes_quantile = pheno_quantile_transform(model_phenotypes, model_phenotypes.columns)

In [None]:
# Regress every standardized lab on every quantile-transformed Sysmex trait,
# adjusting for `rc_consent_age`, `Race`, `is_male` and `draw_analysis_hours`,
# and collect one summary row (p-values, t-scores, betas) per (lab, trait) pair.
results_list = []

# The lab column is renamed to this fixed, identifier-safe name before fitting.
# Splicing the raw lab column name into the formula string would have the
# formula engine parse it as code, so a name containing a space, '-', '.', a
# leading digit, etc. would fail or be misread (e.g. "a-b" as "a minus b").
lab_response_col = "lab_value"
formula_str = lab_response_col + " ~ sysmex + rc_consent_age + Race + is_male + draw_analysis_hours"

# [1:] skips the first column of `labs_standardized`, which is the former index
# moved into a column by reset_index() when the frame was built.
for lab in tqdm(labs_standardized.columns.values[1:]):
    # Loop-invariant for the inner loop: this lab's values keyed by PatientID.
    lab_values = labs_standardized[[lab, "PatientID"]].rename(columns={lab: lab_response_col})

    for pheno in tqdm(model_phenotypes_quantile.columns.values, leave=False):
        # Trait names are '|'-delimited: field 0 is the channel, field 1 the perturbation.
        pheno_parts = pheno.split("|")
        channel = pheno_parts[0]
        perturbation = pheno_parts[1]
        # Sanitized trait name reported in the output table.
        pheno_label = pheno.replace("|", "_").replace(" ", "_").replace(".", "_")

        # Single-column frame holding this trait under the fixed name used in the formula.
        sysmex_values = model_phenotypes_quantile[[pheno]]
        sysmex_values.columns = ['sysmex']

        # Analysis-hours column matching this trait's perturbation.
        analysis_h = selected_phenotypes_analysis_hours[[perturbation]].rename(
            columns={perturbation: "draw_analysis_hours"}
        )

        # Inner joins (pandas default) on PatientID. `sysmex_values` is joined via
        # its index. `analysis_h` has no PatientID column, so the last merge relies
        # on its index level being named "PatientID" -- TODO confirm.
        regdf = (
            sysmex_values
            .merge(covariates, left_index=True, right_on="PatientID")
            .merge(lab_values, on="PatientID")
            .merge(analysis_h, on="PatientID")
        )

        results = smf.ols(formula=formula_str, data=regdf).fit()

        # One-row frames of the fitted statistics, columns prefixed by statistic type.
        pvalues = pd.DataFrame(results.pvalues).T.add_prefix('p_')
        tscores = pd.DataFrame(results.tvalues).T.add_prefix('t_')
        params = pd.DataFrame(results.params).T.add_prefix('beta_')

        summary = pd.DataFrame.from_records([{
            'channel': channel,
            'ptb_name': perturbation,
            'sysmex_pheno': pheno_label,
            'lab': lab,
            # rows where both the trait and the lab value are non-missing
            'n_cases': regdf.loc[regdf['sysmex'].notna(), lab_response_col].count(),
            # number of observations reported by the fitted model
            'n_total': results.nobs,
        }])

        results_list.append(pd.concat([summary, pvalues, tscores, params], axis=1))

In [None]:
# Stack the per-(lab, trait) rows and order them by the p-value of the
# `sysmex` term, smallest first, then write the table out without the index.
results_df = pd.concat(results_list).sort_values(by="p_sysmex")
results_df.to_csv(
    "/mnt/obi0/phi/gwas/gwas_analyses/sysmex_custom_gates_v9-obi2020_10_28/clumps_gated_quantile_lab_regression_notransplant_21-07-06_quantile_with_counts.csv",
    index=False,
)