## Notebook to look at a specific cis proximal GEX ~ ATAC regression result

In [None]:
# shell out to `date` to print the current date/time in the notebook output
!date

#### import libraries

In [None]:
from anndata import AnnData
import numpy as np
from pandas import (DataFrame as PandasDF, concat, read_csv, Series, read_parquet, 
                    set_option as pd_set_option)
# import scanpy as sc
from scanpy import read_h5ad
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from seaborn import scatterplot, lmplot, displot
from matplotlib.pyplot import rc_context
import json
from os.path import exists
from sklearn.preprocessing import MinMaxScaler
import statsmodels.formula.api as smf
import statsmodels.api as sm

import warnings
warnings.simplefilter('ignore')

import random
random.seed(420)

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# NOTE(review): kept as plain literal assignments; presumably this cell may be
# overridden by a notebook parameter-injection tool -- confirm before restructuring
# modality labels; they are used to build the quantification and result file names,
# and `endogenous` is the response / `exogenous` the predictor in the regression formula
endogenous = 'GEX'
exogenous = 'ATAC'
category = 'curated_type' # 'curated_type' for broad and 'cluster_name' for specific
# the single endogenous/exogenous feature pair to inspect
endo_feature = 'IL10RB'
exog_feature = 'chr21:32451031-32451848'
# cell-type label used in the quantification and result file names
cell_type = 'Astro'
# only used here to select which previously computed result file is read
regression_type = 'glm_tweedie' # 'glm', 'glm_tweedie', or 'rlm'

In [None]:
# parameters
project = 'aging_phase2'
# the annotation column determines the prefix used in quantification/result file names
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    # fail here with a clear message instead of a NameError on prefix_type further down
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
anndata_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'  
in_file = f'{results_dir}/{project}.{endogenous}-{exogenous}.{prefix_type}.{cell_type}.{regression_type}.cis.csv'

# out files

# constants
DEBUG = True
pd_set_option('display.max_rows', 500)
# covariates added to every regression formula, joined as 'a + b + ...'
covariate_terms = ['sex', 'ancestry', 'pmi', 'ph', 'smoker', 'bmi', 'gex_pool', 'atac_pool']
covar_term_formula = ' + '.join(covariate_terms)

if DEBUG:
    print(anndata_file)
    print(in_file)
    print(covar_term_formula)

#### functions

In [None]:
def load_quantification(cell_name: str, modality: str, verbose: bool=False) -> PandasDF:
    """Read the quantification table for one cell type and modality.

    The parquet path is assembled from the notebook-level `quants_dir`,
    `project` and `prefix_type` settings plus the two arguments.

    Args:
        cell_name: cell-type label that appears in the file name.
        modality: modality label that appears in the file name.
        verbose: when True, print the table shape and display 5 random rows.

    Returns:
        The table read from the parquet file, or None when no such file exists.
    """
    quant_path = f'{quants_dir}/{project}.{modality}.{prefix_type}.{cell_name}.pb.parquet'
    if exists(quant_path):
        quants = read_parquet(quant_path)
        if verbose:
            print(f'shape of read {cell_name} quantifications {quants.shape}')
            display(quants.sample(5))
        return quants
    # missing file: callers must be prepared for None
    return None

def load_tissue_quants(tissue: str, endo_ids: set, exog_ids: set, 
                       verbose: bool=False) -> tuple:
    """Load the endogenous and exogenous quantifications for one cell type.

    Both tables are read with `load_quantification` (its own verbose output is
    not requested) and then reduced to the requested feature columns plus the
    `cell_count` column.

    Args:
        tissue: cell-type label passed through to `load_quantification`.
        endo_ids: endogenous feature names to keep (only those present are kept).
        exog_ids: exogenous feature names to keep (only those present are kept).
        verbose: when True, print shapes before/after subsetting and display
            4 random rows of each subset.

    Returns:
        A `(endo_data, exog_data)` pair of subset tables.

    Raises:
        FileNotFoundError: if either quantification file does not exist.
    """
    endo_data = load_quantification(tissue, endogenous)
    exog_data = load_quantification(tissue, exogenous)
    # load_quantification returns None for a missing file; report that clearly
    # instead of failing later with an AttributeError on None
    for modality, table in ((endogenous, endo_data), (exogenous, exog_data)):
        if table is None:
            raise FileNotFoundError(
                f'no {modality} quantification file found for {tissue}')
    if verbose:
        print(f'shape of endogenous data {endo_data.shape}')
        print(f'shape of exogenous data {exog_data.shape}')
    # subset to only needed endo and exog features
    endo_data = endo_data[list(set(endo_data.columns) & endo_ids) + ['cell_count']]
    exog_data = exog_data[list(set(exog_data.columns) & exog_ids) + ['cell_count']]
    if verbose:
        print(f'shape of subset endogenous data {endo_data.shape}')
        print(f'shape of subset exogenous data {exog_data.shape}')        
        display(endo_data.sample(4))
        display(exog_data.sample(4))
    return endo_data, exog_data

def glm_model(formula: str, df: PandasDF, model_type: str='glm'):
    """Fit the requested statsmodels regression and return the fitted result.

    Args:
        formula: model formula string handed to the statsmodels formula API.
        df: data frame holding every column named in `formula`.
        model_type: 'glm_tweedie' (GLM with a Tweedie family, log link,
            var_power=1.6, eql=True), 'rlm' (robust linear model) or
            'glm' (GLM with the statsmodels default family).

    Returns:
        The results object produced by `model.fit()`.

    Raises:
        ValueError: if `model_type` is not one of the supported names.
    """
    if model_type == 'glm_tweedie':
        model = smf.glm(formula=formula, data=df, 
                        family=sm.families.Tweedie(link=sm.families.links.log(), 
                                                   var_power=1.6, 
                                                   eql=True))
    elif model_type == 'rlm':
        model = smf.rlm(formula=formula, data=df)
    elif model_type == 'glm':
        model = smf.glm(formula=formula, data=df)
    else:
        # without this branch `model` would be unbound and fail obscurely below
        raise ValueError(f"unsupported model_type '{model_type}'; "
                         "expected 'glm', 'glm_tweedie', or 'rlm'")
    return model.fit()

def cis_correlation(df: PandasDF, endo_term: str, exog_term: str, 
                    model_type: str='glm', verbose: bool=False) -> list:
    """Regress one endogenous feature on one exogenous feature plus covariates.

    The formula is `endo ~ exog + <covariate_terms> + cell_count_endo +
    cell_count_exog`; feature names are wrapped in `Q("...")` so names that
    are not valid identifiers can be used in the formula.

    Args:
        df: merged table containing both features, the covariate columns and
            the two cell-count columns.
        endo_term: column name of the response feature.
        exog_term: column name of the predictor feature.
        model_type: regression flavour passed to `glm_model`.
        verbose: when True, print the model summary and the returned values,
            or the error if the fit failed.

    Returns:
        `[endo_term, exog_term, intercept, coef, stderr, statistic, p-value]`
        where the last four describe the exogenous term; the five numeric
        entries are NaN when the fit raised an exception.
    """
    model_terms = [endo_term, exog_term] + covariate_terms + ['cell_count_endo', 'cell_count_exog']
    this_formula = f'Q("{endo_term}") ~ Q("{exog_term}") + {covar_term_formula} + cell_count_endo + cell_count_exog'
    try:
        # run GLM via statsmodel
        result = glm_model(this_formula, df[model_terms], model_type)
        ret_exog_term = f'Q("{exog_term}")'
        ret_list = [endo_term, exog_term, result.params['Intercept'], 
                    result.params[ret_exog_term], result.bse[ret_exog_term], 
                    result.tvalues[ret_exog_term], result.pvalues[ret_exog_term]]
        if verbose:
            print(f'df shape {df.shape}')
            print(result.summary())
            print(['endo_feature', 'exog_feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
            print(ret_list)
    except Exception as err:
        # best effort: a failed fit yields NaN statistics instead of aborting,
        # but KeyboardInterrupt/SystemExit are no longer swallowed
        if verbose:
            print(f'regression failed for {endo_term} ~ {exog_term}: {err!r}')
        ret_list = [endo_term, exog_term] + [np.nan] * 5

    return ret_list

def merge_analysis_data(endo_data: PandasDF, exog_data: PandasDF, covars_df: PandasDF, 
                        endo_ids: set, exog_ids: set, verbose: bool=False) -> PandasDF:
    """Join endogenous features, exogenous features and covariates by index.

    Each quantification table is first reduced to the requested features that
    are present plus its `cell_count` column. The two are inner-joined on
    their indices, so overlapping column names (notably `cell_count`) receive
    the `_endo` / `_exog` suffixes. The covariates are then inner-joined on
    the index as well.

    Args:
        endo_data: endogenous quantification table with a `cell_count` column.
        exog_data: exogenous quantification table with a `cell_count` column.
        covars_df: covariate table sharing the same index.
        endo_ids: endogenous feature names to keep.
        exog_ids: exogenous feature names to keep.
        verbose: when True, print the merged shape and display 5 random rows.

    Returns:
        The merged table restricted to index values present in all three inputs.
    """
    # keep only the requested features that actually exist, plus cell_count
    endo_wanted = set(endo_data.columns) & endo_ids
    endo_subset = endo_data[list(endo_wanted) + ['cell_count']]
    exog_wanted = set(exog_data.columns) & exog_ids
    exog_subset = exog_data[list(exog_wanted) + ['cell_count']]

    paired = endo_subset.merge(exog_subset, how='inner',
                               left_index=True, right_index=True,
                               suffixes=('_endo', '_exog'))
    merged = paired.merge(covars_df, how='inner',
                          left_index=True, right_index=True)
    if verbose:
        print(f'shape of merged data is {merged.shape}')
        display(merged.sample(5))
    return merged

def show_pair(tissue: str, covars_df: PandasDF, endo_id: str, exog_id: str,
              model_type: str='glm', verbose: bool=True):
    """Re-run the regression for one feature pair and plot the pair.

    Loads the two quantification tables for `tissue`, merges them with the
    covariates, fits the requested model (the result is only printed when
    `verbose` is set; it is not returned) and then draws, in order: a robust
    lmplot with title and axis labels, a plain lmplot, a KDE of each feature,
    and two scatterplots sized by the endogenous and exogenous cell counts.

    Args:
        tissue: cell-type label whose quantifications are loaded.
        covars_df: covariate table indexed like the quantification tables.
        endo_id: endogenous feature (y axis).
        exog_id: exogenous feature (x axis).
        model_type: regression flavour passed to `cis_correlation`.
        verbose: forwarded to the loading, merging and regression helpers.
    """
    # the helpers take sets of IDs, so wrap each single ID in a one-element set
    endo_set = set([endo_id])
    exog_set = set([exog_id])
    # load quants data
    endo_quants, exog_quants = load_tissue_quants(tissue, endo_set, exog_set, verbose)
    # merge data source
    pair_df = merge_analysis_data(endo_quants, exog_quants, covars_df,
                                  endo_set, exog_set, verbose)
    # run the regressions
    cis_correlation(pair_df, endo_id, exog_id, model_type, verbose)
    # plot the pair
    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': 50}):
        plt.style.use('seaborn-v0_8-talk')
        # robust fit first, annotated with a title and axis labels
        lmplot(x=exog_id, y=endo_id, data=pair_df, palette='Purples',
               scatter_kws={'s': pair_df.age}, robust=True)
        plt.title(f'{endo_id} ~ {exog_id}', fontsize='large')
        plt.xlabel(exog_id)
        plt.ylabel(endo_id)
        plt.show()
        # same view with the default (non-robust) fit
        lmplot(x=exog_id, y=endo_id, data=pair_df, palette='Purples',
               scatter_kws={'s': pair_df.age})
        plt.show()
        # distribution of each feature on its own
        for feature in (endo_id, exog_id):
            displot(pair_df[feature], kind='kde')
            plt.show()
        # scatter sized by each modality's cell count
        for count_col in ('cell_count_endo', 'cell_count_exog'):
            scatterplot(x=exog_id, y=endo_id, size=count_col, data=pair_df)
            plt.show()

### read the specified result

In [None]:
%%time
if exists(in_file):
    glm_results = read_csv(in_file)
    print(f'read {glm_results.shape} results ')
    # show the stored result row(s) for the feature pair of interest
    display(glm_results.loc[(glm_results.endo_feature == endo_feature) &
                            (glm_results.exog_feature == exog_feature)])
else:
    # glm_results is only defined when the file exists, so report the missing
    # file rather than touching an undefined name
    print(f'results file not found: {in_file}')


### load data
read the anndata (h5ad) file

In [None]:
%%time
# read the h5ad file named by anndata_file and print the object's summary
adata = read_h5ad(anndata_file)
print(adata)
if DEBUG:
    # peek at 5 random rows of the obs table
    display(adata.obs.sample(5))

#### take a look at the cell counts by cell type

In [None]:
# number of obs rows for each value of the chosen annotation column
display(adata.obs[category].value_counts())

### format sample covariates

sex, ancestry, age, (gex_pool or atac_pool), pmi, ph, smoker, bmi

In [None]:
keep_terms = ['sample_id','sex', 'ancestry', 'age', 'gex_pool', 'atac_pool', 
              'pmi', 'ph', 'smoker', 'bmi']
# reduce the obs table to the distinct covariate rows and index them by sample_id
covars_df = (adata.obs[keep_terms]
             .drop_duplicates()
             .reset_index(drop=True)
             .set_index('sample_id'))

if DEBUG:
    print(covars_df.shape)
    display(covars_df.head())
    # DataFrame.info() prints its report and returns None, so call it directly
    # rather than display()-ing the None
    covars_df.info()
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

#### fill any missing covariate terms
It looks like smoker and bmi are missing for one sample; set each missing value to the mean of the observed values.

In [None]:
# fill the missing smoker and bmi value
# each missing entry is replaced by that column's mean rounded to one decimal
covars_df.loc[covars_df.smoker.isna(), 'smoker'] = covars_df.smoker.mean().round(1)
covars_df.loc[covars_df.bmi.isna(), 'bmi'] = covars_df.bmi.mean().round(1)

if DEBUG:
    print(covars_df.shape)
    # DataFrame.info() prints its report and returns None, so call it directly
    # rather than display()-ing the None
    covars_df.info()
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

### view results under GLM Tweedie analysis

In [None]:
%%time
# refit and plot the selected pair with the Tweedie GLM
show_pair(tissue=cell_type, covars_df=covars_df, endo_id=endo_feature,
          exog_id=exog_feature, model_type='glm_tweedie')

### view results under GLM analysis

In [None]:
%%time
# refit and plot the selected pair with the default GLM
show_pair(tissue=cell_type, covars_df=covars_df, endo_id=endo_feature,
          exog_id=exog_feature, model_type='glm')

### view results under RLM analysis

In [None]:
%%time
# refit and plot the selected pair with the robust linear model
show_pair(tissue=cell_type, covars_df=covars_df, endo_id=endo_feature,
          exog_id=exog_feature, model_type='rlm')

In [None]:
# shell out to `date` again so the output shows when the notebook finished
!date