## Notebook to run differential expression in single-cell data using GLM model and pseudo-bulk quantifications per sample

based on some of the observations related to pseudoreplication and zero-inflation from

[Zimmerman KD, Espeland MA, Langefeld CD. A practical solution to pseudoreplication bias in single-cell studies. Nat Commun 2021;12:738.](https://pubmed.ncbi.nlm.nih.gov/33531494/)


In [None]:
!date

#### import libraries and set notebook variables

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import diffxpy.api as de
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from numba import jit

import warnings
warnings.simplefilter('ignore')

import random
random.seed(420)

In [None]:
# parameters
# NOTE(review): the empty defaults look like placeholders that are presumably
# overridden per run by whatever launches the notebook — confirm.
# `tissue` is passed to diffexp_group as the group value to analyse and is
# also embedded in the results file name.
tissue = ''
# `tissue_type` is passed to diffexp_group as the obs column that holds `tissue`.
tissue_type = ''
# when True the notebook subsets to a random sample of features before the analysis
testing = False

In [None]:
# naming
cohort = 'aging'
assay = 'RNA'

# directories for initial setup
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'

# in files
in_file = f'{quants_dir}/{cohort}.pegasus.leiden_085.Age_group_young_old.h5ad'
# NOTE: a plain str.format template (not an f-string); this_dir and name are
# filled in by read_testable_de_features
diffxpy_file = '{this_dir}/{name}_de_diffxpy.csv'

# out files
results_file = f'{results_dir}/{cohort}.{tissue.replace(" ", "_")}.glm_pb_age_diffs.csv'

# constants
# NOTE(review): young_age_limit is not passed to convert_ad_to_df in the visible
# code; that function falls back to its own default of 30.0
young_age_limit = 30.0
min_cell_count = 3
# names of the obs columns holding the brain region and cell-type labels
region_obs_feature = 'Brain_region'
celltype_obs_feature = 'new_anno'
# number of features randomly sampled when `testing` is True
testing_sample_size = 100 #25


#### analysis functions

In [None]:
def subset_ad_by_type(data: ad.AnnData, group_name: str, type_name: str,
                      reapply_filter: bool=True, min_cell_count: int=3,
                      verbose: bool=False) -> ad.AnnData:
    """Return a copy of ``data`` restricted to the cells of one group.

    Args:
        data: annotated data to subset.
        group_name: value to keep in the ``type_name`` obs column.
        type_name: obs column that is compared against ``group_name``.
        reapply_filter: when True, re-run scanpy's gene and cell filters on
            the subset.
        min_cell_count: threshold handed to both filters; note that it is
            passed as ``min_counts`` (not ``min_cells``).
        verbose: print the shape before/after filtering and the subset.

    Returns:
        A new AnnData object (a copy, not a view) holding the selected cells.
    """
    this_data = data[data.obs[type_name] == group_name].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # read the shape outside the branch so it is defined even when no filter
    # ran (previously verbose=True with reapply_filter=False raised
    # UnboundLocalError)
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def convert_ad_to_df(data: ad.AnnData, young_age_limit: float=30.0, 
                     verbose: bool=False) -> pd.DataFrame:
    """Flatten an AnnData object into one cells x (features + covariates) frame.

    The expression matrix from ``data.to_df()`` is joined column-wise with a
    fixed set of obs columns plus two derived 0/1 indicators: ``old``
    (``Age`` greater than ``young_age_limit``) and ``female``
    (``Sex`` equal to ``'Female'``).

    Returns:
        The combined frame, or ``None`` when the expression index and the
        obs index are not equal.
    """
    expr_df = data.to_df()
    obs_columns = ['Brain_region', 'Age', 'Age_group', 'pool_name',
                   'Sample_id', 'Sex', 'donor_id']
    covars = data.obs[obs_columns].copy()
    covars['old'] = np.where(covars['Age'] > young_age_limit, 1, 0)
    covars['female'] = np.where(covars['Sex'] == 'Female', 1, 0)

    # refuse to join when the two tables are not row-aligned
    if not expr_df.index.equals(covars.index):
        return None

    combined = pd.concat([expr_df, covars], axis='columns')
    if verbose:
        print(f'anndata to pandas df complete: {combined.shape}')
        print(combined.shape)
        display(combined.head())
    return combined

# NOTE: deliberately not numba-jitted. The body is pure pandas, which numba
# cannot compile in nopython mode, so the decorator could only ever run in
# object mode (no speed-up) or fail outright, depending on the numba version.
def feature_detected(feature: str=None, df: pd.DataFrame=None, 
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False) -> bool:
    """Decide whether ``feature`` is detected in enough samples.

    A sample counts as detecting the feature when it has more than
    ``min_cell_count`` cells with a value above zero. The feature is
    considered detected when the fraction of such samples, out of all distinct
    ``Sample_id`` values in ``df``, is at least ``min_sample_det_rate``.

    Args:
        feature: column of ``df`` to check.
        df: cells x columns frame containing ``feature`` and ``Sample_id``.
        min_cell_count: a sample needs strictly more non-zero cells than this.
        min_sample_det_rate: minimum fraction of samples that must pass.
        verbose: print the intermediate counts and the verdict.

    Returns:
        True when the feature passes, otherwise False; always False when
        ``df`` holds no samples.
    """
    total_samples = df['Sample_id'].nunique()
    nz_df = df.loc[df[feature] > 0]
    nz_cells_per_sample = nz_df['Sample_id'].value_counts()
    ok_sample_cnt = int((nz_cells_per_sample > min_cell_count).sum())
    # guard the ratio: an empty frame has zero samples to divide by
    good_feature = bool(total_samples > 0
                        and ok_sample_cnt / total_samples >= min_sample_det_rate)
    if verbose:
        print(feature, end=', ')
        print(f'nz_df.shape = {nz_df.shape}', end=', ')
        print(f'{ok_sample_cnt}/{total_samples}', end=', ')
        print(good_feature)
    return good_feature

# NOTE: deliberately not numba-jitted; the loop only dispatches to pandas code,
# which numba cannot compile in nopython mode.
def poorly_detected_features(features: list=None, df: pd.DataFrame=None, 
                             verbose=False) -> list:
    """Return the features that fail the ``feature_detected`` check.

    Args:
        features: iterable of column names in ``df`` to test; ``None`` is
            treated as no features.
        df: cells x columns frame handed to ``feature_detected``.
        verbose: forwarded to ``feature_detected``; also prints the number of
            rejected features.

    Returns:
        List of feature names for which ``feature_detected`` returned False,
        in input order.
    """
    if features is None:
        features = []
    # verbose must go by keyword: passed positionally it lands in
    # feature_detected's min_cell_count slot and silently replaces the
    # threshold with False/True (0/1)
    bad_features = [feature for feature in features
                    if not feature_detected(feature, df, verbose=verbose)]
    if verbose:
        print(f'bad features counts is {len(bad_features)}')
    return bad_features

def non_de_features(data: ad.AnnData, group_name: str, alpha: float=0.05) -> list:
    """List the genes whose diffxpy t-test p-value is above ``alpha``.

    The test groups cells by the ``Age_group`` obs column. ``group_name`` is
    accepted but not used in the body.

    Returns:
        Values of the summary's ``gene`` column for rows with ``pval > alpha``.
    """
    summary = de.test.t_test(data=data, grouping='Age_group').summary()
    not_significant = summary['pval'] > alpha
    return list(summary[not_significant]['gene'].values)

def read_testable_de_features(group_name: str, alpha: float=0.05) -> list:
    """Load a saved diffxpy result table and list its nominally significant genes.

    The file path is built from the module-level ``diffxpy_file`` template and
    ``quants_dir``, with spaces in ``group_name`` replaced by underscores.

    Args:
        group_name: group whose result file should be read.
        alpha: p-value cut-off; rows with ``pval <= alpha`` are kept.

    Returns:
        List of the ``gene`` values of the retained rows (the return
        annotation previously claimed a DataFrame).
    """
    de_path = diffxpy_file.format(this_dir=quants_dir,
                                  name=group_name.replace(" ", "_"))
    de_df = pd.read_csv(de_path, index_col=0)
    return list(de_df[de_df['pval'] <= alpha]['gene'].values)

def glm_model(formula: str, df: pd.DataFrame, verbose: bool=False, 
                      use_tweedie: bool=True):
    """Fit a statsmodels formula GLM and return the fitted result.

    Args:
        formula: model formula understood by ``smf.glm``.
        df: data frame the formula is evaluated against.
        verbose: print the fit summary.
        use_tweedie: when True fit with a Tweedie family
            (``var_power=1.6``, ``eql=True``); otherwise no family is passed
            and statsmodels' default is used.

    Returns:
        The object returned by ``model.fit()``.
    """
    family_kwargs = {}
    if use_tweedie:
        family_kwargs['family'] = sm.families.Tweedie(link=None, var_power=1.6,
                                                      eql=True)
    fitted = smf.glm(formula=formula, data=df, **family_kwargs).fit()
    if verbose:
        print(fitted.summary())
    return fitted

@jit(nopython=True)
def compute_fold_change(intercept: float, coef: float) -> float:
    """Return the log2 fold change implied by an intercept and a coefficient.

    For a positive coefficient this is ``log2((intercept + coef) / intercept)``;
    otherwise it is ``-log2(intercept / (intercept - abs(coef)))``. The two
    forms are algebraically the same quantity, because
    ``intercept - abs(coef)`` equals ``intercept + coef`` when ``coef <= 0``.
    """
    if coef > 0:
        return np.log2((intercept + coef) / intercept)
    reduced = intercept - abs(coef)
    return -np.log2(intercept / reduced)

def compute_frmt_pb(df: pd.DataFrame, feature: str) -> pd.DataFrame:
    """Build the per-sample (pseudo-bulk) table for one feature.

    The feature is averaged over all rows sharing a ``Sample_id``, then each
    sample's ``pool_name`` and ``old`` values are attached via a left merge.

    Returns:
        Frame with the mean ``feature`` per sample plus the ``Sample_id``,
        ``pool_name`` and ``old`` columns.
    """
    feature_by_cell = df[[feature, 'Sample_id']]
    sample_means = feature_by_cell.groupby('Sample_id').mean()
    # one row per distinct (Sample_id, pool_name, old) combination
    sample_covars = df[['Sample_id', 'pool_name', 'old']].drop_duplicates()
    return sample_means.merge(sample_covars, how='left',
                              left_index=True, right_on='Sample_id')

def glm_diff_expr_age(df: pd.DataFrame, feature: str, verbose: bool=False) -> list:
    """Fit the age GLM for one feature on per-sample means.

    The model is ``Q("<feature>") ~ old + C(pool_name)``, fitted through
    ``glm_model`` with ``use_tweedie=False`` on the table produced by
    ``compute_frmt_pb``.

    Args:
        df: cells x columns frame containing ``feature``, ``Sample_id``,
            ``pool_name`` and ``old``.
        feature: column to model as the dependent term.
        verbose: print shapes, the fit summary and the result row; on failure
            print the caught error.

    Returns:
        A 7-item list: feature name, intercept, ``old`` coefficient, its
        standard error, test statistic, p-value and log2 fold change. When
        any step fails the six numeric entries are NaN.
    """
    dep_term = feature
    indep_term = 'old'
    this_formula = f'Q("{dep_term}") ~ {indep_term} + C(pool_name)'
    try:
        # collapse cells to one mean value per sample before modelling
        pb_df = compute_frmt_pb(df, feature)
        # run GLM via statsmodel
        result = glm_model(this_formula, pb_df, use_tweedie=False)
        fold_change = compute_fold_change(result.params['Intercept'], 
                                          result.params[indep_term])
        ret_list = [dep_term, result.params['Intercept'], 
                    result.params[indep_term], result.bse[indep_term], 
                    result.tvalues[indep_term], result.pvalues[indep_term], 
                    fold_change]
        if verbose:
            print(f'df shape {df.shape}')
            print(f'non-zero df shape {pb_df.shape}')
            print(result.summary())
            print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value', 'log2_fc'])
            print(ret_list)
    except Exception as err:
        # best-effort per feature: a failed fit yields a NaN row instead of
        # aborting the whole run. Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit able to stop a long loop.
        if verbose:
            print(f'Caught Error for {dep_term}: {err!r}')
        ret_list = [dep_term] + [np.nan] * 6

    return ret_list

def diff_exp_of_features(df: pd.DataFrame, features: set) -> list:
    """Run ``glm_diff_expr_age`` for every feature and collect the result rows.

    Returns:
        One result per element of ``features``, in iteration order.
    """
    return [glm_diff_expr_age(df, feature) for feature in features]

def diffexp_group(data: ad.AnnData, group_name: str, 
                  type_name: str, min_cell_count: int=3, 
                  verbose: bool=False, use_prev_prep: bool=True) -> pd.DataFrame:
    """Run the per-feature age GLM for one brain region or cell type.

    Steps: subset ``data`` to the group, re-filter genes, convert to a data
    frame, choose the features to test, fit one GLM per feature, and collect
    the results.

    Args:
        data: full annotated data set.
        group_name: value of ``type_name`` to analyse.
        type_name: obs column holding the group labels.
        min_cell_count: threshold used for the subset re-filtering and for
            the ``min_cells`` gene filter.
        verbose: print progress messages.
        use_prev_prep: when True take the testable features from a saved
            diffxpy result file; otherwise drop poorly detected features and
            screen the remainder with a diffxpy t-test.

    Returns:
        One row per tested feature with columns ``feature``, ``intercept``,
        ``coef``, ``stderr``, ``z``, ``p-value``, ``log2_fc``, ``tissue`` and
        ``type``.

    Raises:
        ValueError: if ``convert_ad_to_df`` returns ``None``, which it does
            when the expression and obs indexes do not match.
    """
    if verbose:
        print(f'starting subset {group_name}')
    # subset anndata object by type_name (brain region or cell-type); forward
    # the caller's threshold so both filtering passes agree
    type_ad = subset_ad_by_type(data, group_name, type_name,
                                min_cell_count=min_cell_count)
    # now dealing with just cells in this region or cell-type so refilter genes
    sc.pp.filter_genes(type_ad, min_cells=min_cell_count)
    if verbose:
        print('converting anndata to pandas df')        
    type_df = convert_ad_to_df(type_ad)
    if type_df is None:
        # fail with a clear message instead of an AttributeError further down
        raise ValueError(f'could not convert {group_name} to a data frame: '
                         'expression and obs indexes differ')
    # if prep files exist from the glmmTMB analysis
    if use_prev_prep:
        print('using previous prep to find features for glm modeling')
        testable_features = read_testable_de_features(group_name)
        exclude_features = set(type_ad.var.index) - set(testable_features)
        type_clean_df = type_df.drop(columns=exclude_features)
        features_set = set(type_ad.var.index) & set(type_clean_df.columns)
    else:
        # find features poorly detected and don't include in analysis
        if verbose:
            print(f'finding poorly detected features from cells x features {type_df.shape}')    
        bad_features = poorly_detected_features(type_ad.var.index.values, type_df)
        type_clean_df = type_df.drop(columns=bad_features)
        keep_features = set(type_ad.var.index) & set(type_clean_df.columns)
        type_clean_ad = type_ad[:,list(keep_features)] 
        # cheap t-test screen first, so the per-feature GLM only runs on
        # features that pass it
        if verbose:
            print('running diffxpy t-test screen')    
        exclude_features = non_de_features(type_clean_ad, group_name)
        type_clean_df = type_clean_df.drop(columns=exclude_features)
        features_set = set(type_clean_ad.var.index) & set(type_clean_df.columns)    
    type_results = diff_exp_of_features(type_clean_df, features_set)
    results_df = pd.DataFrame(data=type_results, 
                              columns=['feature', 'intercept', 'coef', 
                                       'stderr', 'z', 'p-value', 'log2_fc'])
    results_df['tissue'] = group_name
    # label the rows by which obs column the group came from
    results_df['type'] = 'brain_region' if type_name == region_obs_feature else 'cell_type'       
    if verbose:
        print(f'done', end='. ')
    return results_df

#### read the anndata (h5ad) file

In [None]:
%%time
# read the h5ad file named by in_file into `adata`, the object every later
# cell works on
adata = sc.read(in_file, cache=True)

# show the object's summary
print(adata)

#### take a look at the cell counts by cell type

In [None]:
# get cell counts by 'categories'
# reuse the obs column names declared in the config cell instead of repeating
# the literals, so a renamed column only has to be changed in one place
categories = [region_obs_feature, celltype_obs_feature]
for this_cat in categories:
    print(adata.obs[this_cat].value_counts())

#### get sample counts per age group by 'categories'

In [None]:
# for each category column, count the distinct Sample_id values per
# (category value, Age_group) pair; relies on `categories` from the cell above
for this_cat in categories:
    print(adata.obs.groupby([this_cat,'Age_group'])['Sample_id'].nunique())

In [None]:
# UMAP coloured by the cell-type label column, drawn in a 12x12 figure
# sc.pl.umap(adata, color=[celltype_obs_feature], legend_loc='on data')
with rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(adata, color=[celltype_obs_feature], legend_loc='on data', 
               add_outline=True, legend_fontsize=10)

##### find cell-types we won't use in analysis
remove them, and then refilter genes based on cell count

In [None]:
# cell-type labels excluded from the analysis: every label containing
# 'uncertain' plus 'Astrocyte-GFAP-Hi'
found_uncertain = [x for x in adata.obs[celltype_obs_feature].unique().to_list() 
                   if 'uncertain' in x] + ['Astrocyte-GFAP-Hi']
print(found_uncertain)
# take an explicit copy: the boolean subset is an AnnData view, and the
# in-place gene filter below would otherwise trigger an implicit copy
adata = adata[~adata.obs[celltype_obs_feature].isin(found_uncertain), :].copy()
# fewer cells remain, so re-apply the minimum-cells gene filter
sc.pp.filter_genes(adata, min_cells=min_cell_count)
adata

### if testing the notebook for debugging purposes, subset the features

In [None]:
if testing:
    # debugging shortcut: keep only a random sample of features
    all_genes = list(adata.var.index.values)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available features
    genes = random.sample(all_genes, min(testing_sample_size, len(all_genes)))
    adata = adata[:,genes]

#### for this brain region or cell-type compute the differential expression info

In [None]:
%%time
# run the analysis for the group selected in the parameters cell
# (tissue = group value, tissue_type = obs column that holds it)
results_df = diffexp_group(adata, tissue, tissue_type, verbose=True)
print(results_df.shape)
display(results_df.head())
# every row carries the same tissue value, so this prints the row count for the group
print(results_df['tissue'].value_counts())

#### save the results

In [None]:
# write the result table to results_file as CSV without the row index
results_df.to_csv(results_file, index=False)

In [None]:
!date