## Notebook to identify ATAC peak features that contain risk variants

In [None]:
# shell escape: print the current date/time at the top of the notebook run
!date

#### import libraries

In [None]:
from scanpy import read_h5ad
from pandas import read_csv, read_hdf
from pybedtools import BedTool
from seaborn import barplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# NOTE(review): this looks like the cell a notebook runner would override with
# injected values — confirm; assignments are kept as plain literals for that reason
disease = 'AD' # 'AD' or 'LBD'
# modality label; it is interpolated into the age results file name
modality = 'ATAC'
category = 'curated_type' # 'curated_type' for broad and 'cluster_name' for specific
# regression label; it is interpolated into the results and conditioned file names
REGRESSION_TYPE = 'glm_tweedie'

In [None]:
# naming
project = 'aging_phase2'
# translate the cell-type category into the prefix used in results file names;
# fail fast on an unsupported value instead of a later NameError on prefix_type
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
public_dir = f'{wrk_dir}/public'
results_dir = f'{wrk_dir}/results'

# in files
results_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
# the conditioned results are always read from the GEX analysis, not from {modality}
cond_file = f'{results_dir}/{project}.GEX.{prefix_type}.{REGRESSION_TYPE}.conditioned.age.csv'
anndata_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'
locus_ld_info_file = f'{public_dir}/risk_ld_info/adrd_ipsc_{disease}.ld_prime.csv'
# pick the GWAS summary statistics for the disease; an unsupported disease is
# reported here rather than as an undefined gwas_sum_stats_file further down
if disease == 'LBD':
    gwas_sum_stats_file = f'{public_dir}/chia_lbd_gwas/GCST90001390_buildGRCh38.tsv.gz'
elif disease == 'AD':
    gwas_sum_stats_file = f'{public_dir}/bellenguez_ad_gwas/GCST90027158_buildGRCh38.tsv.gz'
else:
    raise ValueError(f"unsupported disease '{disease}'; expected 'AD' or 'LBD'")

# out files
risk_peaks_bed = f'{quants_dir}/{project}_{disease}_risk_peaks.bed'

# variables
DEBUG = True
# GWAS p-value cutoffs: suggestive and genome-wide significant
SUG_THRESHOLD = 1.00e-05
SIG_THRESHOLD = 5.00e-08
# cutoff applied to the conditioned regression p-values
NOMINAL_ALPHA = 0.05
if DEBUG:
    print(f'results_file = {results_file}')
    print(f'anndata_file = {anndata_file}')
    print(f'locus_ld_info_file = {locus_ld_info_file}')
    print(f'gwas_sum_stats_file = {gwas_sum_stats_file}')
    print(f'cond_file = {cond_file}')

### load input data

#### load feature annotations

In [None]:
%%time
adata_df = read_h5ad(anndata_file)
print(adata_df)
features_df = adata_df.var.loc[adata_df.var.modality == 'Peaks'].copy()
features_df['id'] = features_df.index
print(features_df.shape)
if DEBUG:
    display(features_df.head())

In [None]:
# sanity check: tally the modality labels left after the subset
features_df['modality'].value_counts()

#### load summary stats

In [None]:
%%time
gwas_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_df.shape)
if DEBUG:
    display(gwas_df.head())

#### load the LD variants for the loci

In [None]:
# LD information for the risk loci of the selected disease
ld_df = read_csv(filepath_or_buffer=locus_ld_info_file)
print(ld_df.shape)
if DEBUG:
    display(ld_df.head())

In [None]:
# preview the first rows of the summary statistics
gwas_df.head(n=5)

#### load the modality's age associated results

In [None]:
# age associated results for the modality
age_df = read_csv(results_file)
print(f'shape of {modality} results {age_df.shape}')
if DEBUG:
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the preview at the table size in case the results table is short
    display(age_df.sample(min(5, age_df.shape[0])))

### subset summary stats to variants that are either significant, or suggestive and in LD with index variants
may want to use a stricter (lower) significance threshold to be closer to fine-mapping

In [None]:
# genome-wide significant variants are always kept
is_significant = gwas_df['p_value'] <= SIG_THRESHOLD
# suggestive variants are kept only when they are listed as an LD variant (SNP_B)
is_suggestive = gwas_df['p_value'] <= SUG_THRESHOLD
is_ld_variant = gwas_df['variant_id'].isin(ld_df['SNP_B'])
risk_df = gwas_df.loc[is_significant | (is_ld_variant & is_suggestive)]
print(risk_df.shape)
if DEBUG:
    display(risk_df['chromosome'].value_counts())
    display(risk_df.head())

### find ATAC peak features that contain a risk variant

#### convert ATAC features dataframe to bed

In [None]:
# BED4 layout for the peaks: chrom, start, end, name
peak_bed_columns = ['chr', 'start', 'end', 'id']
feature_bed = BedTool.from_dataframe(features_df[peak_bed_columns])
# interval count, then number of fields per interval
print(feature_bed.count(), feature_bed.field_count(), sep='\n')
if DEBUG:
    display(feature_bed.to_dataframe().head())

#### convert summary stats to bed

In [None]:
gwas_bed_columns = ['chromosome', 'base_pair_location', 'variant_id', 'p_value']
risk_bed_df = risk_df[gwas_bed_columns].copy()
# prefix the chromosome labels with 'chr'
risk_bed_df['chromosome'] = 'chr' + risk_bed_df['chromosome'].astype(str)
# each variant becomes a one-base interval: start = position - 1, end = position
risk_bed_df.insert(loc=1, column='start',
                   value=risk_bed_df['base_pair_location'] - 1)
risk_bed = BedTool.from_dataframe(risk_bed_df)

# interval count, then number of fields per interval
print(risk_bed.count(), risk_bed.field_count(), sep='\n')
if DEBUG:
    display(risk_bed.to_dataframe().head())

#### intersect the beds

In [None]:
# wb=True appends the overlapping risk-variant record to each peak record
feature_intersect = feature_bed.intersect(risk_bed, wb=True)

# overlap count, then number of fields per record
print(feature_intersect.count(), feature_intersect.field_count(), sep='\n')
if DEBUG:
    display(feature_intersect.to_dataframe().head())

### save the bed for the ATAC features containing risk variants

In [None]:
# to_dataframe() applies default BED column names; map the ones holding the
# variant fields back to GWAS-style names
gwas_column_names = {
    'strand': 'm1position',
    'thickStart': 'position',
    'thickEnd': 'variant',
    'itemRgb': 'pvalue',
}
risk_features_df = feature_intersect.to_dataframe().rename(columns=gwas_column_names)
# NOTE: with these to_csv defaults the output is comma-separated with a header
# row, even though the path ends in .bed
risk_features_df.to_csv(risk_peaks_bed, index=False)

In [None]:
# preview the peaks that overlap a risk variant
risk_features_df.head(n=5)

In [None]:
# number of peak/variant overlaps per chromosome
risk_features_df['chrom'].value_counts()

### how many age associated ATAC peaks also contain possible disease risk variants

In [None]:
# age associated results restricted to peaks that overlap a risk variant
risk_peak_names = risk_features_df['name']
risk_age_results = age_df.loc[age_df['feature'].isin(risk_peak_names)]
print(f'shape of risk_age_results is {risk_age_results.shape}')
print(f'number of unique peaks is {risk_age_results.feature.nunique()}')
# show short tables in full, otherwise only the first rows
show_everything = risk_age_results.shape[0] < 20
display(risk_age_results if show_everything else risk_age_results.head())

#### visualize the counts

In [None]:
# denominator: number of distinct peaks that overlap a risk variant
n_risk_peaks = risk_features_df['name'].nunique()
# per-tissue count of non-null entries in each results column
counts = risk_age_results.groupby('tissue').count()
# percentage of the risk peaks that are age associated in each tissue
counts['feature_proportion'] = (counts['feature'] / n_risk_peaks * 100).round(2)
counts = counts.sort_values(by='feature_proportion', ascending=False)
if DEBUG:
    display(counts)

In [None]:
# apply the style sheet first and the explicit rc settings second, so the
# requested figure size/dpi are not overridden by the style sheet's own values;
# both are restored when the block exits
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (11, 11), 'figure.dpi': 100}):
    # 'tissue' is the index of counts after the groupby; turn it into a regular
    # column so x='tissue' resolves regardless of the seaborn version
    barplot(data=counts.reset_index(), x='tissue', y='feature_proportion', color='purple')
    plt.xticks(rotation=90)
    plt.title(f'% peaks containing {disease} possible common risk variants that are age associated peaks', fontsize='large')
    plt.xlabel('Cell types')
    # lay out only after the title and labels exist so they are accounted for
    plt.tight_layout()
    plt.show()

#### how many of these age associated ATAC peaks that contain possible risk variants modulate an age associated gene effect

##### load the conditioned age regression results
find the pairwise results where the gene's age associated effect is mediated

In [None]:
cond_df = read_csv(cond_file, index_col=0)
print(f'shape of cond_df is {cond_df.shape}')
# keep only the rows whose conditioned p-value is above the nominal alpha
above_alpha = cond_df['p-value'] > NOMINAL_ALPHA
cond_df = cond_df.loc[above_alpha]
print(f'shape of cond_df is {cond_df.shape}')
if DEBUG:
    display(cond_df.head())

In [None]:
# for every age associated risk peak, show the conditioned results where that
# peak is the exogenous feature in the same tissue
for peak, tissue_name in zip(risk_age_results['feature'], risk_age_results['tissue']):
    same_peak = cond_df['exog_feature'] == peak
    same_tissue = cond_df['tissue'] == tissue_name
    matches = cond_df.loc[same_peak & same_tissue]
    if len(matches) > 0:
        display(matches)

In [None]:
# shell escape: print the current date/time at the end of the notebook run
!date