## Notebook to run differential expression in single-cell data using LMM

Based on some of the observations related to pseudoreplication and zero-inflation from Zimmerman et al. (2021), cited below.

This notebook runs on the data input preps that were done for the glmmTMB-based analysis.

[Zimmerman KD, Espeland MA, Langefeld CD. A practical solution to pseudoreplication bias in single-cell studies. Nat Commun 2021;12:738.](https://pubmed.ncbi.nlm.nih.gov/33531494/)


In [None]:
# shell escape: print the current date/time as a record of when this was run
!date

#### import libraries

In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib.pyplot import rc_context
from numba import jit
from pandas import read_csv

warnings.simplefilter('ignore')

random.seed(420)

#### set notebook variables

In [None]:
# parameters
# name of the brain region or cell type being analysed; it is interpolated into
# the input and output file names built in the next cell
tissue_or_cell_name = 'ExN_THEMIS'
# when True, the `if testing:` cell further down sub-samples the features
testing = True

In [None]:
# naming
# NOTE: cohort and assay are not referenced by any other cell visible in this
# notebook
cohort = 'aging'
assay = 'RNA'

# directories for initial setup
wrk_dir = '/home/jupyter/brain_aging_phase1'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'

# in files
# modeling table for this tissue/cell type, read with the first column as index
in_file = f'{quants_dir}/{tissue_or_cell_name}_glmmtmb_in_df_temp.csv'

# out files
# destination of the per-feature results table written by the last cell
results_file = f'{results_dir}/{tissue_or_cell_name}.lmm_age_diffs.csv'

# constants
# DEBUG gates the display() previews in the data-loading cells
DEBUG = True
# NOTE: young_age_limit and this module-level min_cell_count are not referenced
# by any other cell visible in this notebook
young_age_limit = 30.0
min_cell_count = 3
# get cell counts by 'categories'
# region_obs_feature is compared against `type_name` in diffexp_group to label
# a result as 'brain_region' or 'cell_type'
region_obs_feature = 'Brain_region'
celltype_obs_feature = 'new_anno'
# NOTE: reassigned to 3 in the "testing subset" cell before it is first used
testing_sample_size = 100 #25
# non-feature columns of the modeling table; every other column is treated as
# a feature to test
covariates = ['Brain_region', 'Age', 'Age_group', 'pool_name', 'Sample_id', 
              'Sex', 'donor_id', 'old', 'female']

#### analysis functions

In [None]:
# exploratory: list the names available under statsmodels' `families` namespace
print(dir(sm.families))

In [None]:
def mixed_model(formula: str, df: pd.DataFrame, group_name: str, tweedie: bool=True):
    """Build a statsmodels MixedLM from a formula and return its fitted result.

    Args:
        formula: model formula handed to ``sm.MixedLM.from_formula``.
        df: table holding the columns the formula refers to, plus the
            grouping column.
        group_name: column of ``df`` whose values are passed as ``groups``.
        tweedie: when True, a Tweedie family object (``var_power=1.6``,
            ``eql=True``) is attached to the model as ``family`` before
            fitting.

    Returns:
        Whatever ``fit()`` on the constructed model returns.

    NOTE(review): it is not evident from this code that MixedLM consults a
    ``family`` attribute when fitting; confirm that the Tweedie setting
    actually changes the fit.
    """
    lmm = sm.MixedLM.from_formula(formula, df, groups=df[group_name])
    if not tweedie:
        return lmm.fit()
    lmm.family = sm.families.Tweedie(link=None, var_power=1.6, eql=True)
    return lmm.fit()

@jit(nopython=True)
def compute_fold_change(intercept: float, coef: float) -> float:
    """Return a log2 fold change derived from a model intercept and coefficient.

    For a positive ``coef`` this is ``log2((intercept + coef) / intercept)``;
    otherwise it is ``-log2(intercept / (intercept - abs(coef)))``.
    """
    if coef > 0:
        return np.log2((intercept + coef) / intercept)
    # zero or negative coefficient: take the ratio the other way round and negate
    return -np.log2(intercept / (intercept - abs(coef)))

def glmm_diff_expr_age(df: pd.DataFrame, feature: str, verbose: bool=False) -> list:
    """Model one feature against the ``old`` indicator and summarise the fit.

    The formula is ``Q("<feature>") ~ old + C(pool_name)`` and observations are
    grouped by ``Sample_id``; the fit itself is delegated to ``mixed_model``.

    Args:
        df: modeling table containing the feature column plus the ``old``,
            ``pool_name`` and ``Sample_id`` columns.
        feature: name of the column used as the dependent variable.
        verbose: when True, print the input shape, the fit summary and the
            returned row; on failure, print the error that was caught.

    Returns:
        A 7-item list: ``[feature, intercept, coef, stderr, z, p-value,
        log2_fc]``. If the model cannot be fitted or summarised, the six
        numeric entries are ``np.nan`` so that one bad feature does not abort
        a whole run.
    """
    dep_term = feature
    indep_term = 'old'
    grouping = 'Sample_id'
    # Q("...") quotes the column name so that names which are not valid Python
    # identifiers can still be used in the formula
    this_formula = f'Q("{dep_term}") ~ {indep_term} + C(pool_name)'
    try:
        # run the mixed model via statsmodels
        result = mixed_model(this_formula, df, grouping)
        fold_change = compute_fold_change(result.params['Intercept'],
                                          result.params[indep_term])
        ret_list = [dep_term, result.params['Intercept'],
                    result.params[indep_term], result.bse[indep_term],
                    result.tvalues[indep_term], result.pvalues[indep_term],
                    fold_change]
        if verbose:
            print(f'df shape {df.shape}')
            print(result.summary())
            print(['feature', 'intercept', 'coef', 'stderr', 'z', 'p-value', 'log2_fc'])
            print(ret_list)
    except Exception as err:
        # best-effort by design: a failed fit yields a NaN row. Only Exception
        # is caught so KeyboardInterrupt / SystemExit still stop a long run.
        if verbose:
            print(f'Caught Error for {dep_term}: {err!r}')
        ret_list = [dep_term] + [np.nan] * 6

    return ret_list

def diff_exp_of_features(df: pd.DataFrame) -> list:
    """Run ``glmm_diff_expr_age`` on every non-covariate column of ``df``.

    Args:
        df: modeling table; columns named in the module-level ``covariates``
            list are skipped and every other column is treated as a feature.

    Returns:
        One result row (as returned by ``glmm_diff_expr_age``) per feature, in
        the order the feature columns appear in ``df``.
    """
    excluded = set(covariates)
    # dict.fromkeys drops repeated names but keeps the frame's column order, so
    # the output order is the same on every run (iterating a set of strings is
    # not stable across interpreter sessions)
    features = [col for col in dict.fromkeys(df.columns) if col not in excluded]
    return [glmm_diff_expr_age(df, feature) for feature in features]

def diffexp_group(data: 'ad.AnnData', group_name: str,
                  type_name: str, min_cell_count: int=3,
                  verbose: bool=False, use_prev_prep: bool=True) -> pd.DataFrame:
    """Compute per-feature results for one group from the prepared input file.

    The modeling table is read from the module-level ``in_file``; every
    non-covariate column is modeled via ``diff_exp_of_features``.

    Args:
        data: not read by this function. The annotation is a string because
            ``ad`` is not imported in this notebook.
        group_name: value stored in the ``tissue`` column of the result.
        type_name: if equal to ``region_obs_feature`` the result's ``type``
            column is ``'brain_region'``, otherwise ``'cell_type'``.
        min_cell_count: not read by this function.
        verbose: when True, print start and end progress messages.
        use_prev_prep: not read by this function; the previously prepared
            input file is always used.

    Returns:
        DataFrame with columns ``feature, intercept, coef, stderr, z, p-value,
        log2_fc, tissue, type``; one row per feature.
    """
    if verbose:
        print(f'starting subset {group_name}')
    print('using previous prep to find features for lmm modeling')
    # the first CSV column is used as the index
    type_clean_df = read_csv(in_file, index_col=0)
    type_results = diff_exp_of_features(type_clean_df)
    results_df = pd.DataFrame(data=type_results,
                              columns=['feature', 'intercept', 'coef',
                                       'stderr', 'z', 'p-value', 'log2_fc'])
    results_df['tissue'] = group_name
    results_df['type'] = 'brain_region' if type_name == region_obs_feature else 'cell_type'
    if verbose:
        print('done', end='. ')
    return results_df

#### read the modeling input data

In [None]:
%%time
# load the prepared modeling table; the first CSV column becomes the index
input_df = read_csv(in_file, index_col=0)
print(f'shape of input df is {input_df.shape}')
if DEBUG:
    # preview five randomly chosen rows
    display(input_df.sample(5))

#### get the target features from the input

In [None]:
# every column that is not a covariate is a target feature. Column order is
# kept (rather than going through a set) so that the seeded random.sample()
# call further down picks the same genes on every run; the iteration order of
# a set of strings changes between interpreter sessions.
targets = [col for col in dict.fromkeys(input_df.columns) if col not in covariates]
print(f'found {len(targets)} targets')
if DEBUG:
    # show the first and last few target names
    display(targets[:5])
    display(targets[-5:])

### testing subset

In [None]:
# NOTE: this replaces the testing_sample_size value set in the constants cell
testing_sample_size = 3
# pick a few target features at random (the random module was seeded with 420
# in the imports cell)
genes = random.sample(targets, testing_sample_size)
# keep only the sampled feature columns plus the covariate columns
test_df = input_df[genes + covariates]
print(test_df.shape)
# preview five randomly chosen rows of the subset
display(test_df.sample(5))

In [None]:
# fit the model for each sampled gene in turn and show the full fit summary;
# the independent term and the grouping column are the same for every gene
indep_term = 'old'
grouping = 'Sample_id'
for gene in genes:
    print(gene)
    dep_term = gene
    # Q("...") quotes the column name inside the formula
    this_formula = f'Q("{dep_term}") ~ {indep_term} + C(pool_name)'
    result = mixed_model(this_formula, test_df, grouping, tweedie=True)
    display(result.summary())

In [None]:
# NOTE(review): `adata` is not defined in any cell visible in this notebook, so
# with testing = True this cell raises NameError as written. Confirm whether it
# is a leftover from an AnnData-based version of the analysis.
if testing:
    # sample testing_sample_size names from adata.var's index and keep only
    # those columns of adata
    genes = random.sample(list(adata.var.index.values), testing_sample_size)
    adata = adata[:,genes]

#### for this brain region or cell-type compute the differential expression info

In [None]:
%%time
# NOTE(review): `adata`, `tissue` and `tissue_type` are not defined in any cell
# visible in this notebook (the parameters cell defines `tissue_or_cell_name`
# instead) — confirm the intended arguments. diffexp_group does not read its
# first argument; it loads its input from `in_file`.
results_df = diffexp_group(adata, tissue, tissue_type, verbose=True)
print(results_df.shape)
display(results_df.head())
# number of result rows per value of the 'tissue' column
print(results_df['tissue'].value_counts())

#### save the results

In [None]:
# write the results table to results_file, leaving out the DataFrame index
results_df.to_csv(results_file, index=False)