## Notebook to look at a specific result

In [None]:
# run the shell `date` command so the output records when this notebook was executed
!date

#### import libraries

In [None]:
from anndata import AnnData
import numpy as np
from pandas import DataFrame, concat, read_csv, Series, read_parquet, set_option as pd_set_option
# import scanpy as sc
from scanpy import read_h5ad
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from seaborn import scatterplot, lmplot, displot
from matplotlib.pyplot import rc_context
import json
from os.path import exists
from sklearn.preprocessing import MinMaxScaler
import statsmodels.formula.api as smf
import statsmodels.api as sm

import warnings
warnings.simplefilter('ignore')

import random
random.seed(420)

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# these values select which regression result and which feature the notebook inspects;
# they are interpolated into the result/quantification file names in later cells
modality = 'GEX' # 'GEX' or 'ATAC'
category = 'curated_type' # 'curated_type' for broad and 'cluster_name' for specific
feature = 'SNTB2' # feature looked up in the results table and the quantified data
cell_type = 'InN' # used in file names and matched against the results `tissue` column
REGRESSION_TYPE = 'glm_tweedie' # 'glm', 'glm_tweedie', or 'rlm'

In [None]:
# parameters
project = 'aging_phase2'
# translate the annotation column into the prefix used in the quant/result file names;
# fail fast on an unsupported category so later cells never see an undefined
# (or stale, from a previous run) prefix_type
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
anndata_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'

# out files

# constants
DEBUG = True  # when True, later cells display extra intermediate tables
pd_set_option('display.max_rows', 500)

### read the specified result

In [None]:
in_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{cell_type}.{REGRESSION_TYPE}.age.csv'
# stop with a clear message when the file is missing; otherwise the lines below would
# hit an undefined glm_results, or silently reuse one left over from an earlier run
if not exists(in_file):
    raise FileNotFoundError(f'regression results file not found: {in_file}')
glm_results = read_csv(in_file)
print(f'read {glm_results.shape} results ')
# show only the row(s) for the feature, cell type and category chosen in the parameters
display(glm_results.loc[(glm_results.feature == feature) & 
                        (glm_results.tissue == cell_type) & 
                        (glm_results.type == category)])

### load quantified data

In [None]:
%%time
this_file = f'{quants_dir}/{project}.{modality}.{prefix_type}.{cell_type}.pb.parquet'
if exists(this_file):
    quants_df = read_parquet(this_file)
print(f'shape of read {cell_type} quantifications {quants_df.shape}')        
if DEBUG:
    display(quants_df[feature])


### load data
read the anndata (h5ad) file

In [None]:
%%time
# read the h5ad file named in the configuration cell; only adata.obs is used afterwards
adata = read_h5ad(anndata_file)
print(adata)
# when debugging, show five randomly sampled rows of the observation metadata
if DEBUG:
    display(adata.obs.sample(5))

#### take a look at the cell counts by cell type

In [None]:
# number of observations in adata.obs for each value of the selected category column
display(adata.obs[category].value_counts())

### format sample covariates

sex, ancestry, age, (gex_pool or atac_pool), pmi, ph, smoker, bmi

In [None]:
keep_terms = ['sample_id','sex', 'ancestry', 'age', 'gex_pool', 'atac_pool', 
              'pmi', 'ph', 'smoker', 'bmi']
# collapse the per-observation metadata to the distinct combinations of these terms
# and index the result by sample_id so it can be joined to the quantified data
covars_df = (adata.obs[keep_terms]
             .drop_duplicates()
             .reset_index(drop=True)
             .set_index('sample_id'))

if DEBUG:
    print(covars_df.shape)
    display(covars_df.head())
    # info() prints its report itself and returns None, so it is called directly
    # rather than wrapped in display(), which would render a stray "None"
    covars_df.info()
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

#### fill any missing covariate terms
It looks like smoker and bmi are missing for one sample; set each missing value to the mean of that covariate.

In [None]:
# fill the missing smoker and bmi value
covars_df.loc[covars_df.smoker.isna(), 'smoker'] = covars_df.smoker.mean().round(1)
covars_df.loc[covars_df.bmi.isna(), 'bmi'] = covars_df.bmi.mean().round(1)

if DEBUG:
    print(covars_df.shape)
    display(covars_df.info())
    display(covars_df.smoker.value_counts())
    display(covars_df.bmi.describe())

#### set the pool term based on modality being analyzed

In [None]:
# pick the pool covariate that matches the modality being analyzed
if modality == 'GEX':
    covars_df['pool'] = covars_df.gex_pool
elif modality == 'ATAC':
    covars_df['pool'] = covars_df.atac_pool
else:
    # without this guard both pool columns would be dropped below with no `pool`
    # column created, and the failure would only surface inside the regression formula
    raise ValueError(f"unsupported modality '{modality}'; expected 'GEX' or 'ATAC'")
# the modality-specific columns are no longer needed once `pool` is set
covars_df = covars_df.drop(columns=['gex_pool', 'atac_pool'])
print(f'shape of covariate terms is {covars_df.shape}')
if DEBUG:
    display(covars_df.head(40))

### merge the covariates with the feature of interest

In [None]:
# join the feature values and cell counts to the sample covariates on their indices;
# the inner join keeps only rows whose index value is present in both tables
# (the former bare `quants_df[[...]]` expression was never displayed and is removed)
data_df = quants_df[[feature, 'cell_count']].merge(covars_df, how='inner', 
                                                   left_index=True, right_index=True)
print(data_df.shape)
if DEBUG:
    display(data_df)

### check if age, the exogenous variable, is correlated with any of the covariate terms

none of the terms appear to have a statistically significant correlation with age

In [None]:
covariate_terms = ['sex', 'ancestry', 'pmi', 'ph', 'smoker', 'bmi', 'pool']
covar_term_formula = ' + '.join(covariate_terms)
this_formula = f'age ~ cell_count + {covar_term_formula}'
print(this_formula)
# regress age on the covariates twice with the same formula:
# first with GLM, then with RLM; each summary is displayed in turn
for build_model in (smf.glm, smf.rlm):
    model = build_model(formula=this_formula, data=data_df)
    result = model.fit()
    display(result.summary())

### regression results when using GLM and Tweedie distribution

In [None]:
endo_term = feature
exog_term = 'age'
# Q("...") quotes the feature name so names that are not valid Python identifiers
# can still be used inside the formula
this_formula = f'Q("{endo_term}") ~ {exog_term} + {covar_term_formula} + cell_count'
# Tweedie family with a log link; `links.Log` is the supported class name, while the
# lowercase `links.log` alias is deprecated in statsmodels (its warning is hidden by
# the global warnings filter set at the top of the notebook)
tweedie_family = sm.families.Tweedie(link=sm.families.links.Log(),
                                     var_power=1.6, eql=True)
model = smf.glm(formula=this_formula, data=data_df, family=tweedie_family)
result = model.fit()
print(result.summary())
# one-row summary of the age term, labelled by the header printed just before it
print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
ret_list = [endo_term, result.params['Intercept'], 
            result.params[exog_term], result.bse[exog_term], 
            result.tvalues[exog_term], result.pvalues[exog_term]]
print(ret_list)

### regression results when using GLM without Tweedie distribution

In [None]:
# same formula as the Tweedie fit, but with no family argument passed to the GLM
model = smf.glm(formula=this_formula, data=data_df)
result = model.fit()
print(result.summary())
print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
# pull the age-term entry from each fitted statistic, in the header's column order
age_term_stats = [stat[exog_term] for stat in
                  (result.params, result.bse, result.tvalues, result.pvalues)]
ret_list = [endo_term, result.params['Intercept']] + age_term_stats
print(ret_list)

### regression results when using RLM

In [None]:
# robust linear model fit on the same formula used for the GLM fits
model = smf.rlm(formula=this_formula, data=data_df)
result = model.fit()
print(result.summary())
print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value'])
# feature name and intercept first, then the age-term coefficient, standard error,
# test statistic and p-value
ret_list = [
    endo_term,
    result.params['Intercept'],
    *(stat[exog_term] for stat in
      (result.params, result.bse, result.tvalues, result.pvalues)),
]
print(ret_list)

In [None]:
# kernel density estimate of the feature's values in the merged table
displot(data_df[feature], kind='kde')

In [None]:
# feature against age, with marker size taken from cell_count
scatterplot(x='age', y=feature, size='cell_count', data=data_df)

In [None]:
# cell_count against the feature, to see whether the two track each other
scatterplot(x=feature, y='cell_count', data=data_df)

In [None]:
# feature against age with seaborn's robust regression fit (robust=True)
lmplot(x='age', y=feature, data=data_df, robust=True)

In [None]:
# feature against age with seaborn's default (non-robust) regression fit
lmplot(x='age', y=feature, data=data_df)