## Notebook to run differential expression in single-cell data using LMM

Based on observations about pseudoreplication and zero-inflation from Zimmerman et al. (2021), cited below.

This notebook runs on the data input preps that were done for the glmmTMB-based analysis.

[Zimmerman KD, Espeland MA, Langefeld CD. A practical solution to pseudoreplication bias in single-cell studies. Nat Commun 2021;12:738.](https://pubmed.ncbi.nlm.nih.gov/33531494/)


In [1]:
# print the current date/time so the run is timestamped in the notebook output
!date

Fri Dec  8 17:27:08 UTC 2023


#### import libraries and set notebook variables

In [2]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import diffxpy.api as de
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import seaborn as sns
import statsmodels.api as sm
from numba import jit

import warnings
# silence every warning category for the rest of the session
# NOTE(review): this presumably also hides model-fitting warnings (e.g. convergence) — confirm that is intended
warnings.simplefilter('ignore')

import random
# seed Python's `random` module; `random.sample` is used later for the testing gene subset
random.seed(420)

In [4]:
# parameters
# name of the tissue or cell type to analyze; it is embedded in the input and output file names
tissue_or_cell_name = 'ExN_THEMIS'
# when True, `adata` is reduced to a random subset of genes further down (debugging aid)
testing = True

In [6]:
# naming
cohort = 'aging'
assay = 'RNA'

# directories for initial setup
# NOTE(review): absolute, site-specific path — adjust when running elsewhere
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'

# in files
# CSV table for this tissue/cell type; read by diffexp_group()
in_file = f'{quants_dir}/{tissue_or_cell_name}_glmmtmb_in_df_temp.csv'

# out files
# destination of the per-feature LMM results table
results_file = f'{results_dir}/{tissue_or_cell_name}.lmm_age_diffs.csv'

# constants
young_age_limit = 30.0
# minimum number of cells passed to sc.pp.filter_genes
min_cell_count = 3
# names of the `adata.obs` columns holding brain region and cell-type labels
region_obs_feature = 'Brain_region'
celltype_obs_feature = 'new_anno'
# number of genes randomly kept when `testing` is True (previously 25)
testing_sample_size = 100 #25
# non-feature columns of the input table; every other column is modeled as a feature
covariates = ['Brain_region', 'Age', 'Age_group', 'pool_name', 'Sample_id', 
              'Sex', 'donor_id', 'old', 'female']

#### analysis functions

In [7]:
def mixed_model(formula: str, df: pd.DataFrame, group_name: str):
    """Fit a statsmodels mixed linear model and return the fitted result.

    Args:
        formula: model formula handed to ``sm.MixedLM.from_formula``.
        df: table holding the variables named in the formula and the grouping column.
        group_name: column of ``df`` whose values are passed as ``groups``.

    Returns:
        The object returned by calling ``fit()`` on the constructed model.
    """
    grouping_values = df[group_name]
    return sm.MixedLM.from_formula(formula, df, groups=grouping_values).fit()

@jit(nopython=True)
def compute_fold_change(intercept: float, coef: float) -> float:
    """Convert a model intercept and coefficient into a log2 fold change.

    Compiled with numba in nopython mode. Both branches are algebraically
    ``log2((intercept + coef) / intercept)``; they are kept separate so the
    floating-point operations match the established results exactly.

    Args:
        intercept: fitted intercept (the baseline level).
        coef: fitted coefficient added to the baseline.

    Returns:
        The log2 ratio of ``intercept + coef`` to ``intercept``.
    """
    # NOTE(review): assumes intercept and intercept + coef are both positive;
    # otherwise the ratio is not a valid log argument — confirm against callers
    if coef > 0:
        raised_level = intercept + coef
        return np.log2(raised_level / intercept)
    lowered_level = intercept - abs(coef)
    return -np.log2(intercept / lowered_level)

def glmm_diff_expr_age(df: pd.DataFrame, feature: str, verbose: bool=False) -> list:
    """Fit the age mixed model for one feature and summarise the ``old`` term.

    The model is ``Q("<feature>") ~ old + C(pool_name)`` with ``Sample_id`` as
    the grouping column, fitted through ``mixed_model``.

    Args:
        df: table containing the feature column plus the ``old``, ``pool_name``
            and ``Sample_id`` columns referenced by the model.
        feature: name of the column used as the dependent variable.
        verbose: when True, print the frame shape, the model summary and the
            result row.

    Returns:
        A 7-item list ``[feature, intercept, coef, stderr, z, p-value, log2_fc]``.
        When fitting or summarising raises an ``Exception``, the six numeric
        slots are ``np.nan`` so the per-feature loop can continue.
    """
    dep_term = feature
    indep_term = 'old'
    grouping = 'Sample_id'
    this_formula = f'Q("{dep_term}") ~ {indep_term} + C(pool_name)'
    # all rows of df are modeled as-is; no zero-valued observations are dropped here
    try:
        # run the mixed model via statsmodels
        result = mixed_model(this_formula, df, grouping)
        fold_change = compute_fold_change(result.params['Intercept'],
                                          result.params[indep_term])
        ret_list = [dep_term, result.params['Intercept'],
                    result.params[indep_term], result.bse[indep_term],
                    result.tvalues[indep_term], result.pvalues[indep_term],
                    fold_change]
        if verbose:
            print(f'df shape {df.shape}')
            # same shape as the line above: no zero filtering is applied
            print(f'non-zero df shape {df.shape}')
            print(result.summary())
            print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value', 'log2_fc'])
            print(ret_list)
    except Exception:
        # Best effort: a feature whose model fails becomes an all-NaN row.
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit can still stop a long per-feature loop.
        ret_list = [dep_term] + [np.nan] * 6

    return ret_list

def diff_exp_of_features(df: pd.DataFrame) -> list:
    """Run ``glmm_diff_expr_age`` on every non-covariate column of ``df``.

    Args:
        df: table whose columns are the module-level ``covariates`` plus one
            column per feature to model.

    Returns:
        One result row (as returned by ``glmm_diff_expr_age``) per feature,
        in the order the feature columns appear in ``df``.
    """
    excluded = set(covariates)
    # dict.fromkeys de-duplicates while keeping column order; iterating a plain
    # set difference of strings follows hash order, which changes between runs
    # and made the row order of the results irreproducible
    features = list(dict.fromkeys(col for col in df.columns if col not in excluded))
    return [glmm_diff_expr_age(df, feature) for feature in features]

def diffexp_group(data: ad.AnnData, group_name: str, 
                  type_name: str, min_cell_count: int=3, 
                  verbose: bool=False, use_prev_prep: bool=True) -> pd.DataFrame:
    """Model every feature of the prepared input table and label the results.

    The table is read from the module-level ``in_file`` CSV and passed to
    ``diff_exp_of_features``.

    Args:
        data: accepted for interface compatibility; not read by this function.
        group_name: value written to the ``tissue`` column of the result.
        type_name: when equal to ``region_obs_feature`` the ``type`` column is
            ``'brain_region'``, otherwise ``'cell_type'``.
        min_cell_count: accepted for interface compatibility; not read here.
        verbose: when True, print start and end progress messages.
        use_prev_prep: accepted for interface compatibility; not read here.

    Returns:
        DataFrame with columns ``feature, intercept, coef, stderr, z, p-value,
        log2_fc, tissue, type`` and one row per modeled feature.
    """
    if verbose:
        print(f'starting subset {group_name}')
    print('using previous prep to find features for lmm modeling')
    # was a bare `read_csv(...)`, a name that is never defined or imported here
    type_clean_df = pd.read_csv(in_file, index_col=0)
    type_results = diff_exp_of_features(type_clean_df)
    result_columns = ['feature', 'intercept', 'coef',
                      'stderr', 'z', 'p-value', 'log2_fc']
    results_df = pd.DataFrame(data=type_results, columns=result_columns)
    results_df['tissue'] = group_name
    results_df['type'] = 'brain_region' if type_name == region_obs_feature else 'cell_type'
    if verbose:
        print('done', end='. ')
    return results_df

#### read the anndata (h5ad) file

In [None]:
%%time
# NOTE(review): `in_file` is built in the setup cell with a `.csv` suffix, while this
# section is titled as reading an h5ad AnnData file — confirm which file is intended
adata = sc.read(in_file, cache=True)

print(adata)

#### take a look at the cell counts by cell type

In [None]:
# get cell counts by 'categories'
# reuse the obs-column constants from the setup cell rather than repeating the literals
categories = [region_obs_feature, celltype_obs_feature]
for this_cat in categories:
    print(adata.obs[this_cat].value_counts())

#### get sample counts per age group by 'categories'

In [None]:
# number of distinct Sample_id values per Age_group within each level of every category
# (`categories` is defined in the cell-count cell above)
for this_cat in categories:
    print(adata.obs.groupby([this_cat,'Age_group'])['Sample_id'].nunique())

In [None]:
# UMAP colored by cell type, drawn on a 12x12 figure with the labels placed on the plot
with rc_context({'figure.figsize': (12, 12)}):
    sc.pl.umap(adata, color=[celltype_obs_feature], legend_loc='on data', 
               add_outline=True, legend_fontsize=10)

##### find cell-types we won't use in analysis
remove them, and then refilter genes based on cell count

In [None]:
# cell types whose label contains 'uncertain', plus 'Astrocyte-GFAP-Hi', are excluded
found_uncertain = [x for x in adata.obs[celltype_obs_feature].unique().to_list() 
                   if 'uncertain' in x] + ['Astrocyte-GFAP-Hi']
print(found_uncertain)
# slicing an AnnData returns a view; take an explicit copy so the in-place
# gene filter below modifies a real object instead of relying on an implicit copy
adata = adata[~adata.obs[celltype_obs_feature].isin(found_uncertain), :].copy()
sc.pp.filter_genes(adata, min_cells=min_cell_count)
adata

### if testing the notebook for debugging purposes, subset the features

In [None]:
if testing:
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the number of available genes
    gene_pool = list(adata.var.index.values)
    genes = random.sample(gene_pool, min(testing_sample_size, len(gene_pool)))
    # NOTE(review): diffexp_group() reads `in_file` and never uses its `data` argument,
    # so this subset does not limit the features that get modeled — confirm intent
    adata = adata[:,genes]

#### for this brain region or cell-type compute the differential expression info

In [None]:
%%time
results_df = diffexp_group(adata, tissue, tissue_type, verbose=True)
print(results_df.shape)
display(results_df.head())
print(results_df['tissue'].value_counts())

#### save the results

In [None]:
# write the per-feature results table to `results_file` without the row index
results_df.to_csv(results_file, index=False)