## Notebook to run *cis* correlation analysis between gene expression and ATAC peaks

Only genes with a statistically significant age effect, and the ATAC peaks that are <i>cis</i> proximal to them, are of interest.

In [None]:
# print the current date and time at the top of the run
!date

#### import libraries

In [None]:
from pandas import read_csv
from scanpy import read_h5ad
from pandas import DataFrame as PandasDF, read_parquet
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
from os.path import exists
import numpy as np
from multiprocessing import Process

import warnings
warnings.filterwarnings('ignore')

#### set notebook variables

In [None]:
# parameters
# modality on the left-hand side of the regression formula (the response)
endogenous = 'GEX'
# modality on the right-hand side of the regression formula (the predictor of interest)
exogenous = 'ATAC'
category = 'curated_type' # 'curated_type' for broad and 'cluster_name' for specific
# model selector passed to glm_model: 'glm_tweedie', 'rlm' or 'glm'
REGRESSION_TYPE = 'glm_tweedie'

In [None]:
# parameters
project = 'aging_phase2'
# translate the annotation level into the prefix used in input/output file names
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    # stop here with a clear message rather than a NameError on prefix_type below
    raise ValueError(f"unsupported category '{category}', "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'

# in files
endo_results_file = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
exo_results_file = f'{results_dir}/{project}.{exogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
anndata_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'

# out files
# (the per-tissue result file name is built inside analyze_tissue)

# constants
DEBUG = False
# distance added on each side of a gene's start/end when collecting candidate peaks;
# units follow the start/end columns of adata.var (presumably base pairs -- confirm)
MAX_DIST = 1_000_000
# sample-level covariates included in every regression formula
covariate_terms = ['sex', 'ancestry', 'pmi', 'ph', 'smoker', 'bmi', 'gex_pool', 'atac_pool']
covar_term_formula = ' + '.join(covariate_terms)
if DEBUG:
    print(covar_term_formula)
# when TESTING is True only about TEST_FEATURE_SIZE endogenous features are regressed
TESTING = False
TEST_FEATURE_SIZE = 25

#### functions

In [None]:
def load_quantification(cell_name: str, modality: str, verbose: bool=False) -> PandasDF:
    """
    Read the quantification table for one cell type and one modality.

    The parquet file is looked up under `quants_dir`; its name is assembled
    from the notebook-level `project` and `prefix_type` settings together with
    the arguments (the '.pb.' part presumably stands for pseudobulk -- confirm).

    Args:
        cell_name: cell-type label embedded in the file name.
        modality: modality label embedded in the file name.
        verbose: when True, print the table shape and display five random rows.

    Returns:
        The table read from the parquet file, or None when the file is absent.
    """
    quant_path = f'{quants_dir}/{project}.{modality}.{prefix_type}.{cell_name}.pb.parquet'
    if exists(quant_path):
        quants = read_parquet(quant_path)
        if verbose:
            print(f'shape of read {cell_name} quantifications {quants.shape}')        
            display(quants.sample(5))
        return quants
    # nothing on disk for this cell type / modality combination
    return None

def glm_model(formula: str, df: PandasDF, model_type: str='rlm'):
    """
    Fit a statsmodels formula-based regression and return the fitted result.

    Args:
        formula: patsy-style formula describing the model.
        df: table holding every column named in the formula.
        model_type: which model to build:
            'glm_tweedie' -- GLM with a Tweedie family (log link, var_power=1.6,
                             extended quasi-likelihood enabled),
            'rlm'         -- robust linear model,
            'glm'         -- GLM with the statsmodels default family.

    Returns:
        The results object produced by the model's `fit()`.

    Raises:
        ValueError: if `model_type` is not one of the supported names.
    """
    if model_type == 'glm_tweedie':
        # `links.Log` is the class behind the deprecated lowercase `links.log` alias
        family = sm.families.Tweedie(link=sm.families.links.Log(),
                                     var_power=1.6,
                                     eql=True)
        model = smf.glm(formula=formula, data=df, family=family)
    elif model_type == 'rlm':
        model = smf.rlm(formula=formula, data=df)
    elif model_type == 'glm':
        model = smf.glm(formula=formula, data=df)
    else:
        # previously fell through and failed later on an unbound `model` variable
        raise ValueError(f"unknown model_type '{model_type}', "
                         "expected 'glm_tweedie', 'rlm' or 'glm'")
    return model.fit()

def cis_correlation(df: PandasDF, endo_term: str, exog_term: str, verbose: bool=False) -> list:
    """
    Regress one endogenous feature on one exogenous feature plus covariates.

    The formula is `endo ~ exog + <covariate_terms> + cell_count_endo +
    cell_count_exog`, fit with the model selected by `REGRESSION_TYPE`.

    Args:
        df: merged table containing both features, the covariates and the two
            cell-count columns.
        endo_term: column name of the response feature.
        exog_term: column name of the predictor feature.
        verbose: when True, print the model summary and the returned values
            (or the error message if the fit failed).

    Returns:
        A list `[endo_term, exog_term, intercept, coef, stderr, statistic,
        p-value]`, where the last four describe the exogenous term. If the fit
        fails for any reason the five numeric entries are NaN.
    """
    model_terms = [endo_term, exog_term] + covariate_terms + ['cell_count_endo', 'cell_count_exog']
    this_formula = f'Q("{endo_term}") ~ Q("{exog_term}") + {covar_term_formula} + cell_count_endo + cell_count_exog'
    # name of the exogenous term as it appears in the fitted result tables
    ret_exog_term = f'Q("{exog_term}")'
    try:
        # run GLM via statsmodel
        result = glm_model(this_formula, df[model_terms], model_type=REGRESSION_TYPE)
        ret_list = [endo_term, exog_term, result.params['Intercept'], 
                    result.params[ret_exog_term], result.bse[ret_exog_term], 
                    result.tvalues[ret_exog_term], result.pvalues[ret_exog_term]]
        if verbose:
            print(f'df shape {df.shape}')
            print(result.summary())
            print(['endo_feature', 'exog_feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
            print(ret_list)
    except Exception as error:
        # best effort: a failed fit becomes a NaN row, but KeyboardInterrupt and
        # SystemExit are no longer swallowed the way a bare `except:` would
        if verbose:
            print(f'Caught Error for {endo_term} ~ {exog_term}: {error}')
        ret_list = [endo_term, exog_term] + [np.nan] * 5

    return ret_list


def compute_bh_fdr(df: PandasDF, alpha: float=0.05, p_col: str='p-value',
                   method: str='fdr_bh', verbose: bool=True) -> PandasDF:
    """
    Add multiple-testing corrected p-values to a copy of a results table.

    Args:
        df: results table; it is not modified.
        alpha: error rate handed to `multipletests` and used for the
            significance count that is printed.
        p_col: name of the column holding the raw p-values.
        method: correction method handed to `multipletests`; also the name of
            the new column.
        verbose: when True, print the shape of the rows below `alpha`.

    Returns:
        A copy of `df` with the corrected p-values in a column named `method`.
    """
    corrected_df = df.copy()
    raw_pvalues = np.array(corrected_df[p_col])
    # second element of the multipletests result holds the corrected p-values
    _, adjusted_pvalues, *_ = multipletests(raw_pvalues, alpha=alpha, method=method)
    corrected_df[method] = adjusted_pvalues
    if verbose:
        significant = corrected_df.loc[corrected_df[method] < alpha]
        print(f'total significant after correction: {significant.shape}')
    return corrected_df

def load_tissue_quants(tissue: str, endo_ids: set, exog_ids: set, 
                       verbose: bool=False) -> tuple:
    """
    Load the endogenous and exogenous quantifications for one tissue.

    Both tables are reduced to the requested feature columns plus the
    'cell_count' column.

    Args:
        tissue: cell-type / tissue label used to locate the quantification files.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        verbose: when True, print table shapes and display a few random rows.

    Returns:
        A tuple `(endo_data, exog_data)` of the two subset tables.

    Raises:
        FileNotFoundError: if either quantification table could not be loaded
            (load_quantification returned None).
    """
    endo_data = load_quantification(tissue, endogenous)
    exog_data = load_quantification(tissue, exogenous)
    # the loader signals a missing file with None; report it clearly instead of
    # failing later while indexing None
    for modality, quants in ((endogenous, endo_data), (exogenous, exog_data)):
        if quants is None:
            raise FileNotFoundError(f'no {modality} quantification found for {tissue}')
    if verbose:
        print(f'shape of endogenous data {endo_data.shape}')
        print(f'shape of exogenous data {exog_data.shape}')
    # subset to only needed endo and exog features
    endo_data = endo_data[list(set(endo_data.columns) & endo_ids) + ['cell_count']]
    exog_data = exog_data[list(set(exog_data.columns) & exog_ids) + ['cell_count']]
    if verbose:
        print(f'shape of subset endogenous data {endo_data.shape}')
        print(f'shape of subset exogenous data {exog_data.shape}')        
        display(endo_data.sample(4))
        display(exog_data.sample(4))
    return endo_data, exog_data

def merge_analysis_data(endo_data: PandasDF, exog_data: PandasDF, covars_df: PandasDF, 
                        endo_ids: set, exog_ids: set, verbose: bool=False) -> PandasDF:
    """
    Combine both modalities and the covariates into one analysis table.

    Each quantification table is first reduced to the requested features plus
    'cell_count'. The tables are then inner-joined on their indexes, so only
    rows present in all three inputs survive. The two 'cell_count' columns are
    disambiguated as 'cell_count_endo' and 'cell_count_exog'.

    Args:
        endo_data: endogenous quantifications, including a 'cell_count' column.
        exog_data: exogenous quantifications, including a 'cell_count' column.
        covars_df: covariates sharing the same index as the quantifications.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        verbose: when True, print the merged shape and display five random rows.

    Returns:
        The merged table.
    """
    # subset to only needed endo and exog features
    endo_keep = list(set(endo_data.columns) & endo_ids) + ['cell_count']
    exog_keep = list(set(exog_data.columns) & exog_ids) + ['cell_count']
    paired_quants = endo_data[endo_keep].merge(exog_data[exog_keep], how='inner',
                                               left_index=True, right_index=True,
                                               suffixes=('_endo', '_exog'))
    merged = paired_quants.merge(covars_df, how='inner',
                                 left_index=True, right_index=True)
    if verbose:
        print(f'shape of merged data is {merged.shape}')        
        display(merged.sample(5))
    return merged

def regress_tissue(data: PandasDF, pairings: dict, verbose: bool=False) -> PandasDF:
    """
    Run the cis regression for every endogenous/exogenous pairing.

    Args:
        data: merged analysis table passed to each regression.
        pairings: maps each endogenous feature to the list of exogenous
            features it should be tested against.
        verbose: when True, print the result shape and display five random rows.

    Returns:
        One row per tested pair with the columns 'endo_feature',
        'exog_feature', 'intercept', 'coef', 'stderr', 'z' and 'p-value'.
    """
    result_columns = ['endo_feature', 'exog_feature', 'intercept', 
                      'coef', 'stderr', 'z', 'p-value']
    rows = []
    for endo_feature, exog_features in pairings.items():
        for exog_feature in exog_features:
            rows.append(cis_correlation(data, endo_feature, exog_feature))
    results_df = PandasDF(data=rows, columns=result_columns)
    if verbose:
        print(f'shape of results {results_df.shape}')
        display(results_df.sample(5))
    return results_df

def analyze_tissue(tissue: str, endo_feats: PandasDF, exog_feats: PandasDF, 
                   covars_df: PandasDF, endo_ids: set, exog_ids: set, 
                   pairings: dict, verbose: bool=False):
    """
    Run the full cis analysis for one tissue and write the results to CSV.

    Steps: load both quantification tables, merge them with the covariates,
    regress every pairing, and save the result table under `results_dir`.

    Args:
        tissue: cell-type / tissue label to analyze.
        endo_feats: not used inside this function.
        exog_feats: not used inside this function.
        covars_df: sample covariates.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        pairings: endogenous feature -> list of exogenous features to test.
        verbose: forwarded to the load, merge and regression helpers.
    """
    # destination of the per-tissue result table
    out_path = f'{results_dir}/{project}.{endogenous}-{exogenous}.{prefix_type}.{tissue}.{REGRESSION_TYPE}.cis.csv'
    # load quants data
    endo_quants, exog_quants = load_tissue_quants(tissue, endo_ids, exog_ids, verbose)
    # merge data source
    merged = merge_analysis_data(endo_quants, exog_quants, covars_df, endo_ids, exog_ids, verbose)
    # run the regressions and save the results
    regress_tissue(merged, pairings, verbose).to_csv(out_path, index=False)

def subset_for_test(pairs_ori: dict, feature_cnt: int) -> dict:
    """
    Keep only the first `feature_cnt` entries of a pairings dictionary.

    Args:
        pairs_ori: endogenous feature -> list of exogenous features.
        feature_cnt: maximum number of endogenous features to keep.

    Returns:
        `pairs_ori` itself when it already has at most `feature_cnt` entries,
        otherwise a new dict with the first `feature_cnt` entries in insertion
        order. A non-positive `feature_cnt` yields an empty dict unless
        `pairs_ori` is itself empty.
    """
    # measure the argument itself, not a notebook-level global
    if len(pairs_ori) <= feature_cnt:
        return pairs_ori
    kept_ids = list(pairs_ori)[:max(feature_cnt, 0)]
    return {endo_id: pairs_ori[endo_id] for endo_id in kept_ids}

### load the GEX results to find which genes in which 'cell-types' should be interrogated

In [None]:
# read the age results table for the endogenous modality (path set in endo_results_file)
endo_results_df = read_csv(endo_results_file)
print(f'shape of GEX results {endo_results_df.shape}')
if DEBUG:
    # peek at a few random rows
    display(endo_results_df.sample(5))

#### how many genes per cell-type with a result will be considered

In [None]:
# number of distinct features across the whole results table
print(endo_results_df.feature.nunique())
# number of result rows per value of the tissue column
display(endo_results_df.tissue.value_counts())

### load the ATAC results

In [None]:
# read the age results table for the exogenous modality (path set in exo_results_file)
exo_results_df = read_csv(exo_results_file)
# label the output with the exogenous modality; it previously said 'GEX'
print(f'shape of {exogenous} results {exo_results_df.shape}')
if DEBUG:
    # peek at a few random rows
    display(exo_results_df.sample(5))

#### how many peaks per cell-type

In [None]:
# number of distinct features across the whole results table
print(exo_results_df.feature.nunique())
# number of result rows per value of the tissue column
display(exo_results_df.tissue.value_counts())

### load the anndata file
An easy, combined place to get the genomic locations of the possible features (taken from `adata.var`).

In [None]:
# read the AnnData object; its var table is used below for feature locations
adata = read_h5ad(anndata_file)
print(adata)
if DEBUG:
    # show a few random features from each modality
    for modality_name in ('Gene Expression', 'Peaks'):
        display(adata.var.loc[adata.var.modality == modality_name].sample(4))

### format sample covariates
sex, ancestry, age, (gex_pool or atac_pool), pmi, ph, smoker, bmi

In [None]:
# collapse the per-observation metadata to its unique rows and index them by sample_id
# (one row per sample only if covariates are constant within a sample -- TODO confirm)
keep_terms = ['sample_id','sex', 'ancestry', 'age', 'gex_pool', 'atac_pool', 
              'pmi', 'ph', 'smoker', 'bmi']
covars_df = (adata.obs[keep_terms]
             .drop_duplicates()
             .reset_index(drop=True)
             .set_index('sample_id'))

if DEBUG:
    print(covars_df.shape)
    display(covars_df.head())
    display(covars_df.info())
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

#### fill any missing covariate terms
It looks like smoker and bmi are missing for one sample; each missing value is set to the mean of that covariate.

In [None]:
# replace missing smoker and bmi entries with the column mean rounded to one decimal
for term in ('smoker', 'bmi'):
    fill_value = covars_df[term].mean().round(1)
    covars_df.loc[covars_df[term].isna(), term] = fill_value

if DEBUG:
    print(covars_df.shape)
    display(covars_df.info())
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

### identify the cis proximal possible pairs regardless of cell-type

In [None]:
# location columns for every var entry whose index appears in the endogenous results
in_endo_results = adata.var.index.isin(endo_results_df.feature)
endo_features = adata.var.loc[in_endo_results, ['ID', 'chr', 'start', 'end']].copy()
print(f'shape of endo features {endo_features.shape}')
if DEBUG:
    display(endo_features.sample(4))

In [None]:
# candidate exogenous features with their genomic locations
if exogenous == 'ATAC':
    # only the chromatin accessibility peaks
    exo_features = (adata.var[adata.var.modality == 'Peaks']
                    [['ID', 'modality', 'chr', 'start', 'end']].copy())
else:
    # all features regardless of modality
    exo_features = adata.var[['ID', 'modality', 'chr', 'start', 'end']].copy()
# report the exogenous table; this previously printed the endo_features shape
print(f'shape of exo features {exo_features.shape}')
if DEBUG:
    display(exo_features.modality.value_counts())
    display(exo_features.sample(4))

In [None]:
%%time
# map each endogenous feature (by var index) to the IDs of the exogenous
# features on the same chromosome lying fully inside its +/- MAX_DIST window
endo_cis_proximal = {}
for chrom in endo_features.chr.unique():
    genes_on_chrom = endo_features.loc[endo_features.chr == chrom]
    peaks_on_chrom = exo_features.loc[exo_features.chr == chrom]
    print(chrom, genes_on_chrom.shape, peaks_on_chrom.shape)
    # extent covered by the exogenous features on this chromosome
    first_peak_start = peaks_on_chrom.start.min()
    last_peak_end = peaks_on_chrom.end.max()
    for gene in genes_on_chrom.itertuples():
        # clamp the window to the extent covered by the exogenous features
        window_start = np.maximum(gene.start - MAX_DIST, first_peak_start)
        window_end = np.minimum(gene.end + MAX_DIST, last_peak_end)
        # keep features that start and end inside the window
        inside_window = (peaks_on_chrom.start >= window_start) & (peaks_on_chrom.end <= window_end)
        endo_cis_proximal[gene.Index] = peaks_on_chrom.loc[inside_window].ID.to_list()

In [None]:
# how many tests will be done
endo_count = len(endo_cis_proximal)
exo_count = sum(len(exos) for exos in endo_cis_proximal.values())
# grow one set in place; rebuilding it with `|` on every iteration copies
# the whole accumulated set each time
peak_features = set()
for exos in endo_cis_proximal.values():
    peak_features.update(exos)
print(f'unique endo count {endo_count}')
print(f'total endo exo comparisons {exo_count}')
print(f'toal unique exo counts {len(peak_features)}')

### regress the possible endo-exo pairs per tissue

In [None]:
%%time
# when testing, trim the pairings once before any worker is started
if TESTING:
    endo_cis_proximal = subset_for_test(endo_cis_proximal, TEST_FEATURE_SIZE)
    print(endo_cis_proximal.keys())

# the endogenous id set is the same for every tissue, so build it once
endo_feature_ids = set(endo_features.index.unique())

# start one worker process per tissue found in the endogenous results
cmds = {}
for tissue in endo_results_df.tissue.unique():
    print(tissue)
    p = Process(target=analyze_tissue, args=(tissue, endo_features, exo_features,
                                             covars_df, endo_feature_ids,
                                             peak_features, endo_cis_proximal))
    p.start()
    # Append process and key to keep track
    cmds[tissue] = p

# Wait for all processes to finish
failed_tissues = []
for tissue, p in cmds.items():
    p.join()
    # a non-zero exit code means the worker raised or was killed, so no
    # complete result file can be assumed for that tissue
    if p.exitcode != 0:
        failed_tissues.append(tissue)
if failed_tissues:
    print(f'analysis did not complete for: {failed_tissues}')

In [None]:
# print the current date and time at the end of the run
!date