## Combine and summarize the cis conditioned regression analysis: the age regression for the age associated GEX features conditioned on *cis* correlated ATAC features that are also age associated

In [None]:
# run the shell `date` command (IPython `!` shell escape) to print the time at the start of the notebook
!date

#### import libraries

In [None]:
from pandas import read_csv, concat, DataFrame
from os.path import exists
from seaborn import scatterplot, barplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# modality labels; both are interpolated into the result file names built further down
endogenous = 'GEX'
exogenous = 'ATAC'
# grouping column that selects the 'broad' or 'specific' file name prefix
category = 'cluster_name' # 'curated_type' for broad and 'cluster_name' for specific
# regression label embedded in every input and output file name
REGRESSION_TYPE = 'glm_tweedie'

In [None]:
# parameters
project = 'aging_phase2'
# translate the grouping column into the label used inside result file names
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    # fail here with a clear message instead of a NameError on prefix_type
    # when the file names are built
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
# every location below is derived from this single working directory
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
# NOTE(review): quants_dir is not referenced by any cell visible in this notebook
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'
info_dir = f'{wrk_dir}/sample_info'

# in files
# FDR filtered age regression results for the endogenous (GEX) and exogenous (ATAC) modalities
endo_results_file = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
exo_results_file = f'{results_dir}/{project}.{exogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
# FDR filtered cis results pairing endogenous features with exogenous features
cis_results_file = f'{results_dir}/{project}.{endogenous}-{exogenous}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.cis.csv'
# NOTE(review): info_file is only printed in DEBUG mode; no visible cell reads it
info_file = f'{info_dir}/{project}.sample_info.csv'

# out files
# combined conditioned results across cell types, plus the two summary figures
out_file = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}.pairwise_conditioned.age.csv'
fig_filename = f'{figures_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}.pairwise_conditioned.summary.png'
bar_fig_filename = f'{figures_dir}/{project}.{endogenous}.{prefix_type}.{REGRESSION_TYPE}.pairwise_conditioned.summary_bar.png'

# constants
# DEBUG switches on the extra prints and the random-row previews throughout the notebook
DEBUG = False
# p-value threshold applied to the conditioned results when counting mediating peaks
ALPHA = 0.05
if DEBUG:
    print(f'endo_results_file = {endo_results_file}')
    print(f'exo_results_file = {exo_results_file}')
    print(f'cis_results_file = {cis_results_file}')
    print(f'info_file = {info_file}')
    print(f'out_file = {out_file}')

### load the GEX results to find which gene features to perform cis conditioning on

In [None]:
# read the age regression results for the endogenous modality; later cells
# use its `feature` and `tissue` columns
endo_results_df = read_csv(endo_results_file)
print(f'shape of GEX results {endo_results_df.shape}')
if DEBUG:
    # preview five random rows; sample(5) needs at least five rows in the table
    display(endo_results_df.sample(5))

#### how many genes per cell-type with a result will be considered

In [None]:
# number of distinct features across all cell types
print(endo_results_df.feature.nunique())
# number of result rows per value of the `tissue` column
display(endo_results_df.tissue.value_counts())

### load the ATAC results

In [None]:
# read the age regression results for the exogenous modality; its `feature`
# column is used below to filter the cis results
exo_results_df = read_csv(exo_results_file)
print(f'shape of ATAC results {exo_results_df.shape}')
if DEBUG:
    # preview five random rows; sample(5) needs at least five rows in the table
    display(exo_results_df.sample(5))

#### how many peaks per cell-type

In [None]:
# number of distinct features across all cell types
print(exo_results_df.feature.nunique())
# number of result rows per value of the `tissue` column
display(exo_results_df.tissue.value_counts())

### load the cis proximal correlation results

In [None]:
# read the cis results; later cells use its `endo_feature`, `exog_feature`
# and `tissue` columns
cis_results_df = read_csv(cis_results_file)
print(f'shape of cis correlation results {cis_results_df.shape}')
if DEBUG:
    # preview five random rows; sample(5) needs at least five rows in the table
    display(cis_results_df.sample(5))

In [None]:
# number of cis result rows per value of the `tissue` column, before any filtering
display(cis_results_df.tissue.value_counts())

#### subset the cis proximal results to only those features that are age associated

In [None]:
# keep a cis pair only when its gene appears in the GEX age results and its
# peak appears in the ATAC age results
# NOTE(review): membership is tested against all rows of the age result tables,
# not per cell type - confirm this is the intended filter
gene_in_age_results = cis_results_df.endo_feature.isin(endo_results_df.feature)
peak_in_age_results = cis_results_df.exog_feature.isin(exo_results_df.feature)
cis_results_df = cis_results_df.loc[gene_in_age_results & peak_in_age_results]
print(f'new shape of cis correlation results {cis_results_df.shape}')
if DEBUG:
    display(cis_results_df.sample(5))

In [None]:
# number of cis result rows per value of the `tissue` column, after the age-association filter
display(cis_results_df.tissue.value_counts())

### load the results of the cis conditioned analysis

In [None]:
# collect the per-cell-type conditioned regression results; one file is
# expected for every cell type present in the GEX age results
cond_results = []
missing_files = []
for cell_type in endo_results_df.tissue.unique():
    print(cell_type)
    this_file = f'{results_dir}/{project}.{endogenous}.{prefix_type}.{cell_type}.{REGRESSION_TYPE}.pairwise_conditioned.age.csv'
    if not exists(this_file):
        # remember the gap so it can be reported instead of silently skipped
        missing_files.append(this_file)
        continue
    this_results = read_csv(this_file)
    # tag each row with the cell type and prefix type it came from
    this_results['tissue'] = cell_type
    this_results['type'] = prefix_type
    cond_results.append(this_results)
if missing_files:
    print(f'no conditioned results file found for {len(missing_files)} cell type(s):')
    for missing_file in missing_files:
        print(f'  {missing_file}')
if not cond_results:
    # concat() of an empty list raises an opaque "No objects to concatenate"
    raise FileNotFoundError(f'no pairwise conditioned result files found in {results_dir}')
# convert list of result dataframes to single dataframe; ignore_index gives
# every row a unique label instead of repeating each file's own 0..n index
cond_results_df = concat(cond_results, ignore_index=True)
print(f'shape of cond_results_df is {cond_results_df.shape}')
if DEBUG:
    display(cond_results_df.sample(5))

### save the combined results

In [None]:
# write the combined conditioned results; to_csv defaults to index=True, so the
# dataframe index is written as the first, unnamed column
cond_results_df.to_csv(out_file)

### summarize the results of the conditioned regression

After conditioning the age associated gene on cis proximal ATAC peaks that are also age associated and correlated with the gene, does an age effect remain?

In [None]:
# stub summary result metrics dataframe
summary_df = endo_results_df[['feature', 'tissue']].drop_duplicates(keep='first').copy()
print(f'shape of summary_df is {summary_df.shape}')
if DEBUG:
    display(summary_df.sample(5))

#### compute some of the metrics

In [None]:
%%time
summary_df['cis_cor_peaks'] = 0
summary_df['cis_cor_age_peaks'] = 0
summary_df['mediating_peak_count'] = 0
for row in summary_df.itertuples():
    # how many cis correlated peaks for gene feature and cell-type
    this_corr_result = cis_results_df.loc[(cis_results_df.endo_feature == row.feature) & (cis_results_df.tissue == row.tissue)]
    if this_corr_result.shape [0] > 0:
        summary_df.loc[(summary_df.feature == row.feature) & 
                        (summary_df.tissue == row.tissue), 
                        'cis_cor_peaks'] = this_corr_result.shape[0]
    # how many cis correlated age associated peaks for gene feature and cell-type
    this_cond_result = cond_results_df.loc[(cond_results_df.endo_feature == row.feature) & (cond_results_df.tissue == row.tissue)]
    if this_cond_result.shape[0] > 0:
        summary_df.loc[(summary_df.feature == row.feature) & 
                       (summary_df.tissue == row.tissue), 
                       'cis_cor_age_peaks'] = this_cond_result.shape[0]
        # how many cis correlated age associated peaks that mediate age effect for this gene feature and cell-type                
        this_mediated_result = this_cond_result.loc[this_cond_result['p-value'] > ALPHA]
        if this_mediated_result.shape[0] > 0:
            summary_df.loc[(summary_df.feature == row.feature) & 
                           (summary_df.tissue == row.tissue), 
                           'mediating_peak_count'] = this_mediated_result.shape[0]


#### visualize the summary

In [None]:
# Scatter of cis correlated peak count (x) against mediating peak count (y)
# for every gene/cell-type pair, colored by cell type and sized by the number
# of conditioned (age associated) peaks.
# The style is applied first and the rc overrides second: the talk style sets
# its own figure.figsize, so calling plt.style.use() inside rc_context would
# discard the size requested here.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (11, 11), 'figure.dpi': 100}):
    # sample(frac=1) shuffles the rows so the drawing order is random
    scatterplot(data=summary_df.sample(frac=1), x='cis_cor_peaks', y='mediating_peak_count',
                hue='tissue', size='cis_cor_age_peaks', palette='bright')
    # anchor the legend to the right of the axes
    plt.legend(bbox_to_anchor=(1.4, 1), loc='upper right', borderaxespad=0)
    plt.tight_layout()
    plt.savefig(fig_filename)
    plt.show()

In [None]:
# for each cell type: the number of distinct genes in the summary and the
# percentage of its summary rows with at least one mediating peak
proportion_rows = []
for cell_type in summary_df.tissue.unique():
    in_cell_type = summary_df[summary_df.tissue == cell_type]
    gene_total = in_cell_type.feature.nunique()
    # summing the boolean mask counts the rows with a mediating peak
    mediated_total = int((in_cell_type.mediating_peak_count > 0).sum())
    percent_mediated = mediated_total / gene_total * 100
    proportion_rows.append([cell_type, gene_total, percent_mediated])
mediated_proportions = DataFrame(proportion_rows, columns=['tissue', 'count', 'percent'])
print(f'shape of mediated_proportions is {mediated_proportions.shape}')
if DEBUG:
    display(mediated_proportions)

In [None]:
# Bar chart of the percent of age associated genes per cell type that have at
# least one mediating peak, sorted from highest to lowest.
# The style is applied first and the rc overrides second: the talk style sets
# its own figure.figsize, so calling plt.style.use() inside rc_context would
# discard the size requested here.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': 100}):
    barplot(data=mediated_proportions.sort_values('percent', ascending=False),
            x='tissue', y='percent', color='purple')
    plt.xticks(rotation=90)
    plt.title('% of age associated genes that are mediated by a cis correlated age associated ATAC peak')
    plt.xlabel('Cell types')
    # lay out only after the title and axis label exist so they are included
    plt.tight_layout()
    plt.savefig(bar_fig_filename)
    plt.show()

In [None]:
# run the shell `date` command (IPython `!` shell escape) to print the time at the end of the notebook
!date