## Notebook to rerun age regression for the age associated GEX features conditioned on *cis* correlated ATAC features that are also age associated

Only interested in genes with a statistically significant age effect and the <i>cis</i> proximal ATAC peaks that also have a statistically significant age effect.

In [None]:
# IPython shell escape: print the current system date and time
!date

#### import libraries

In [None]:
from pandas import DataFrame as PandasDF, read_parquet, read_csv
import statsmodels.api as sm
import statsmodels.formula.api as smf
from os.path import exists
import numpy as np
from multiprocessing import Process

import warnings
warnings.filterwarnings('ignore')

#### set notebook variables

In [None]:
# parameters
# modality whose features are the regression response; also used in file names
endogenous = 'GEX'
# modality whose cis features are added as conditioning terms; also used in file names
exogenous = 'ATAC'
category = 'curated_type' # 'curated_type' for broad and 'cluster_name' for specific
# model selector handed to glm_model: 'glm_tweedie', 'rlm' or 'glm'
REGRESSION_TYPE = 'glm_tweedie'

In [None]:
# parameters
project = 'aging_phase2'
# translate the cell-type annotation level into the prefix used in file names
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    # fail here with a clear message instead of a NameError when paths are built
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'

# in files
# FDR filtered age regression results for the endogenous and exogenous modalities
endo_results_file = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
exo_results_file = f'{results_dir}/{project}.{exogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
# FDR filtered cis results pairing endogenous with exogenous features
cis_results_file = f'{results_dir}/{project}.{endogenous}-{exogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.cis.csv'
# sample covariate table
info_file = f'{info_dir}/{project}.sample_info.csv'

# out files
# (the per-tissue output path is built inside analyze_tissue)

# constants
DEBUG = False
# sample covariates included in every regression formula
covariate_terms = ['sex', 'ancestry', 'pmi', 'ph', 'smoker', 'bmi', 'gex_pool', 'atac_pool']
# covariates pre-joined into the right-hand-side fragment of the model formula
covar_term_formula = ' + '.join(covariate_terms)
if DEBUG:
    print(covar_term_formula)
# when TESTING is set, tissues with more than TEST_FEATURE_SIZE pairs are
# reduced to the pairs of a random sample of endogenous features
TESTING = False
TEST_FEATURE_SIZE = 25

#### functions

In [None]:
def load_quantification(cell_name: str, modality: str, verbose: bool=False) -> PandasDF:
    """Read the parquet quantification table for one cell type and modality.

    The path is assembled from the notebook level ``quants_dir``, ``project``
    and ``prefix_type`` settings.

    Args:
        cell_name: cell type label used in the quantification file name.
        modality: modality label used in the file name (e.g. 'GEX', 'ATAC').
        verbose: when True, print the table shape and display five random rows.

    Returns:
        The table read from disk, or None when the file does not exist.
    """
    quants_path = f'{quants_dir}/{project}.{modality}.{prefix_type}.{cell_name}.pb.parquet'
    if exists(quants_path):
        quants = read_parquet(quants_path)
        if verbose:
            print(f'shape of read {cell_name} quantifications {quants.shape}')
            display(quants.sample(5))
        return quants
    # nothing on disk for this cell type / modality combination
    return None

def glm_model(formula: str, df: PandasDF, model_type: str='rlm'):
    """Fit a statsmodels formula model of the requested type.

    Args:
        formula: patsy style model formula.
        df: table holding every term referenced by ``formula``.
        model_type: 'glm_tweedie' (Tweedie family, log link, var_power=1.6,
            extended quasi-likelihood), 'rlm' (robust linear model) or
            'glm' (GLM with the statsmodels default family).

    Returns:
        The fitted statsmodels results object.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # validate up front; previously an unknown type left `model` unbound
    if model_type not in ('glm_tweedie', 'rlm', 'glm'):
        raise ValueError(f"unknown model_type '{model_type}'; "
                         "expected 'glm_tweedie', 'rlm' or 'glm'")
    if model_type == 'glm_tweedie':
        # links.Log is the supported spelling; the lowercase alias is deprecated
        model = smf.glm(formula=formula, data=df,
                        family=sm.families.Tweedie(link=sm.families.links.Log(),
                                                   var_power=1.6,
                                                   eql=True))
    elif model_type == 'rlm':
        model = smf.rlm(formula=formula, data=df)
    else:
        model = smf.glm(formula=formula, data=df)
    return model.fit()

def conditioned_cis_regression(df: PandasDF, endo_term: str, exog_terms: list, verbose: bool=False) -> list:
    """Regress one endogenous feature on age, conditioned on cis exogenous features.

    The model is ``endo_term ~ age + exog_terms + covariates + cell counts`` and
    is fit with the notebook level ``REGRESSION_TYPE``; only the statistics of
    the ``age`` term are reported.

    Args:
        df: merged table with the endogenous and exogenous features, ``age``,
            the ``covariate_terms`` and both ``cell_count_*`` columns.
        endo_term: column name of the response feature.
        exog_terms: column names of the exogenous features to condition on.
        verbose: when True, print the fit summary, or the error if the fit fails.

    Returns:
        ``[endo_feature, exog_feature, intercept, coef, stderr, z, p-value]``
        where ``exog_feature`` is the single conditioning feature or 'multi'
        when the number of conditioning features is not one. The five numeric
        entries are NaN when the model could not be fit.
    """
    model_terms = ['age', endo_term] + exog_terms + covariate_terms + ['cell_count_endo', 'cell_count_exog']
    # for pairwise conditioning there will only be one exogenous cis feature
    conditioned_term = exog_terms[0] if len(exog_terms) == 1 else 'multi'
    # one expression covers both cases: a single term gives the pairwise formula
    quoted_exogs_formula = ''.join(f' + Q("{exog_term}")' for exog_term in exog_terms)
    this_formula = f'Q("{endo_term}") ~ age{quoted_exogs_formula} + {covar_term_formula} + cell_count_endo + cell_count_exog'
    try:
        # run GLM via statsmodel
        result = glm_model(this_formula, df[model_terms], model_type=REGRESSION_TYPE)
        ret_exog_term = 'age'
        ret_list = [endo_term, conditioned_term, result.params['Intercept'],
                    result.params[ret_exog_term], result.bse[ret_exog_term],
                    result.tvalues[ret_exog_term], result.pvalues[ret_exog_term]]
        if verbose:
            print(f'df shape {df.shape}')
            print(result.summary())
            print(['endo_feature', 'exog_feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
            print(ret_list)
    except Exception as error:
        # best effort: a failed fit becomes a NaN row; unlike a bare except this
        # lets KeyboardInterrupt and SystemExit propagate
        if verbose:
            print(f'Caught Error for {endo_term}: {error}')
        ret_list = [endo_term, conditioned_term] + [np.nan] * 5
    return ret_list

def load_tissue_quants(tissue: str, endo_ids: set, exog_ids: set, 
                       verbose: bool=False) -> tuple:
    """Load the endogenous and exogenous quantifications for one tissue.

    Both tables are reduced to the requested feature columns plus their
    ``cell_count`` column.

    Args:
        tissue: cell type label used in the quantification file names.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        verbose: when True, print shapes and display a few random rows.

    Returns:
        ``(endo_data, exog_data)`` as a tuple of two tables.

    Raises:
        FileNotFoundError: if either quantification file is missing.
    """
    endo_data = load_quantification(tissue, endogenous)
    exog_data = load_quantification(tissue, exogenous)
    # the loader returns None for a missing file; report that explicitly
    # rather than failing below with an opaque NoneType error
    for modality, quants in ((endogenous, endo_data), (exogenous, exog_data)):
        if quants is None:
            raise FileNotFoundError(
                f'no {modality} quantification file found for {tissue}')
    if verbose:
        print(f'shape of endogenous data {endo_data.shape}')
        print(f'shape of exogenous data {exog_data.shape}')
    # subset to only needed endo and exog features
    endo_data = endo_data[list(set(endo_data.columns) & endo_ids) + ['cell_count']]
    exog_data = exog_data[list(set(exog_data.columns) & exog_ids) + ['cell_count']]
    if verbose:
        print(f'shape of subset endogenous data {endo_data.shape}')
        print(f'shape of subset exogenous data {exog_data.shape}')        
        display(endo_data.sample(4))
        display(exog_data.sample(4))
    return endo_data, exog_data

def merge_analysis_data(endo_data: PandasDF, exog_data: PandasDF, covars_df: PandasDF, 
                        endo_ids: set, exog_ids: set, verbose: bool=False) -> PandasDF:
    """Combine both modalities and the covariates into one table.

    Each quantification table is reduced to its requested features plus
    ``cell_count``; the three tables are then inner joined on their index, so
    only index values present in all of them remain. The two ``cell_count``
    columns are disambiguated as ``cell_count_endo`` and ``cell_count_exog``.

    Args:
        endo_data: endogenous quantifications including a ``cell_count`` column.
        exog_data: exogenous quantifications including a ``cell_count`` column.
        covars_df: covariate table sharing the same index.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        verbose: when True, print the merged shape and display five random rows.

    Returns:
        The merged table.
    """
    def _keep_requested(quants, wanted_ids):
        # requested features that are actually present, plus the cell counts
        present = set(quants.columns) & wanted_ids
        return quants[list(present) + ['cell_count']]

    endo_subset = _keep_requested(endo_data, endo_ids)
    exog_subset = _keep_requested(exog_data, exog_ids)
    both_modalities = endo_subset.merge(exog_subset, how='inner',
                                        left_index=True, right_index=True,
                                        suffixes=('_endo', '_exog'))
    tissue_data = both_modalities.merge(covars_df, how='inner',
                                        left_index=True, right_index=True)
    if verbose:
        print(f'shape of merged data is {tissue_data.shape}')
        display(tissue_data.sample(5))
    return tissue_data

def conditional_regressions(data: PandasDF, prev_results: PandasDF, verbose: bool=False) -> PandasDF:
    """Run the pairwise and multi feature conditioned regressions.

    One regression is run per row of ``prev_results`` (single conditioning
    feature). One further regression is run for every endogenous feature that
    has more than one distinct cis feature, conditioning on all of them.

    Args:
        data: merged analysis table passed through to each regression.
        prev_results: cis pairs with ``endo_feature`` and ``exog_feature`` columns.
        verbose: when True, print the result shape and display the first rows.

    Returns:
        One row per regression, pairwise rows first, then the multi rows.
    """
    result_columns = ['endo_feature', 'exog_feature', 'intercept',
                      'coef', 'stderr', 'z', 'p-value']
    fits = []
    # run pairwise conditioning
    for pair in prev_results.itertuples():
        fits.append(conditioned_cis_regression(data, pair.endo_feature, [pair.exog_feature]))
    # run multi feature conditioning
    cis_by_endo = prev_results.groupby('endo_feature').exog_feature.unique()
    for endo_name, cis_names in cis_by_endo.items():
        if len(cis_names) > 1:
            fits.append(conditioned_cis_regression(data, endo_name, list(cis_names)))
    results_df = PandasDF(data=fits, columns=result_columns)

    if verbose:
        print(f'shape of results {results_df.shape}')
        display(results_df.head())
    return results_df

def analyze_tissue(tissue: str, prev_results: PandasDF, covars_df: PandasDF, 
                   verbose: bool=False):
    """Run the conditioned age regressions for one tissue and write them to CSV.

    Loads the tissue's quantifications, merges them with the covariates, runs
    the regressions for the pairs in ``prev_results`` and saves the results
    under ``results_dir``. Nothing is returned.

    Args:
        tissue: cell type label used in the input and output file names.
        prev_results: cis pairs with ``endo_feature`` and ``exog_feature`` columns.
        covars_df: sample covariate table.
        verbose: forwarded to the loading, merging and regression steps.
    """
    endo_ids, exog_ids = set(prev_results.endo_feature), set(prev_results.exog_feature)
    # load quants data
    endo_quants, exog_quants = load_tissue_quants(tissue, endo_ids, exog_ids, verbose)
    # merge data source
    merged = merge_analysis_data(endo_quants, exog_quants, covars_df,
                                 endo_ids, exog_ids, verbose)
    # run the regressions and save the results
    out_path = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{tissue}.{REGRESSION_TYPE}.conditioned.age.csv'
    conditional_regressions(merged, prev_results, verbose).to_csv(out_path, index=False)

### load the GEX results to find which gene features to perform cis conditioning on

In [None]:
# read the age regression results for the endogenous modality
endo_results_df = read_csv(endo_results_file)
print(f'shape of GEX results {endo_results_df.shape}')
if DEBUG:
    display(endo_results_df.sample(5))

#### how many genes per cell-type with a result will be considered

In [None]:
# number of distinct features overall, then the number of result rows per tissue
print(endo_results_df.feature.nunique())
display(endo_results_df.tissue.value_counts())

### load the ATAC results

In [None]:
# read the age regression results for the exogenous modality
exo_results_df = read_csv(exo_results_file)
print(f'shape of ATAC results {exo_results_df.shape}')
if DEBUG:
    display(exo_results_df.sample(5))

#### how many peaks per cell-type

In [None]:
# number of distinct features overall, then the number of result rows per tissue
print(exo_results_df.feature.nunique())
display(exo_results_df.tissue.value_counts())

### load the cis proximal correlation results

In [None]:
# read the cis results; the endo_feature, exog_feature and tissue columns are used below
cis_results_df = read_csv(cis_results_file)
print(f'shape of cis correlation results {cis_results_df.shape}')
if DEBUG:
    display(cis_results_df.sample(5))

In [None]:
# number of cis result rows per tissue before any filtering
display(cis_results_df.tissue.value_counts())

#### subset the cis proximal results to only those features that are age associated

In [None]:
# keep a cis pair only when both of its features appear in the age results
endo_in_age_results = cis_results_df.endo_feature.isin(endo_results_df.feature)
exog_in_age_results = cis_results_df.exog_feature.isin(exo_results_df.feature)
cis_results_df = cis_results_df.loc[endo_in_age_results & exog_in_age_results]
print(f'new shape of cis correlation results {cis_results_df.shape}')
if DEBUG:
    display(cis_results_df.sample(5))

In [None]:
# number of cis result rows per tissue after the age association filter
display(cis_results_df.tissue.value_counts())

### format sample covariates
sex, ancestry, age, (gex_pool or atac_pool), pmi, ph, smoker, bmi

In [None]:
# the first column becomes the index, which merge_analysis_data joins on
# (presumably the sample id - confirm against the sample info file)
covars_df = read_csv(info_file, index_col=0)
print(f'shape of covars_df {covars_df.shape}')
if DEBUG:
    display(covars_df.sample(5))

In [None]:
# columns needed by the regressions: age plus the model covariates
keep_terms = ['sex', 'ancestry', 'age', 'gex_pool', 'atac_pool', 
              'pmi', 'ph', 'smoker', 'bmi']
# take an explicit copy: columns of this frame are assigned to afterwards, and
# assigning into a selection of another frame is ambiguous in pandas
covars_df = covars_df[keep_terms].copy()

if DEBUG:
    print(covars_df.shape)
    display(covars_df.head())
    display(covars_df.info())
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

#### fill any missing covariate terms
Looks like smoker and bmi are missing for one sample; will set them to the mean of those values.

In [None]:
# samples without a pool value get the label 'non'; pool values are kept as strings
for pool_term in ('gex_pool', 'atac_pool'):
    covars_df[pool_term] = covars_df[pool_term].fillna('non').astype('str')

# replace missing smoker and bmi values with the column mean rounded to one decimal
for numeric_term in ('smoker', 'bmi'):
    term_mean = covars_df[numeric_term].mean().round(1)
    covars_df.loc[covars_df[numeric_term].isna(), numeric_term] = term_mean

if DEBUG:
    print(covars_df.shape)
    display(covars_df.info())
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

### conditioned regression for the endo-exo age associated pairs per tissue

In [None]:
%%time
# one worker process per tissue, keyed by tissue name
cmds = {}
for tissue in cis_results_df.tissue.unique():
    print(tissue)
    endo_tissue_results = endo_results_df.loc[endo_results_df.tissue == tissue]
    exo_tissue_results = exo_results_df.loc[exo_results_df.tissue == tissue]
    # keep the pairs of this tissue whose features are both in this tissue's age results
    cis_tissue_results = cis_results_df.loc[(cis_results_df.tissue == tissue) & 
                                            (cis_results_df.endo_feature.isin(endo_tissue_results.feature)) &
                                            (cis_results_df.exog_feature.isin(exo_tissue_results.feature))]
    if TESTING and cis_tissue_results.shape[0] > TEST_FEATURE_SIZE:
        # test runs: keep only the pairs of a random sample of endogenous features
        endo_test_features = cis_tissue_results.endo_feature.sample(TEST_FEATURE_SIZE)
        cis_tissue_results = cis_tissue_results.loc[cis_tissue_results.endo_feature.isin(endo_test_features)]
    print((f'endo features {len(set(endo_tissue_results.feature))}, '
           f'exog features {len(set(exo_tissue_results.feature))}, '
           f'pairs shape {cis_tissue_results.shape}'))
    p = Process(target=analyze_tissue,args=(tissue, cis_tissue_results, covars_df))
    p.start()
    # Append process and key to keep track
    cmds[tissue] = p
# Wait for all processes to finish
failed_tissues = []
for key, p in cmds.items():
    p.join()
    # a worker that raised exits with a non-zero code and writes no results file
    if p.exitcode != 0:
        failed_tissues.append(key)
if failed_tissues:
    print(f'WARNING: analysis did not complete for tissues: {failed_tissues}')

In [None]:
# IPython shell escape: print the current system date and time
!date