## Notebook to run differential expression in single-cell data using glmmTMB for a single brain region or cell-type; use it as a template notebook for running each region or cell-type

Based on some of the observations about pseudoreplication and zero-inflation from:

[Zimmerman KD, Espeland MA, Langefeld CD. A practical solution to pseudoreplication bias in single-cell studies. Nat Commun 2021;12:738.](https://pubmed.ncbi.nlm.nih.gov/33531494/)


In [None]:
# shell escape: print the current date/time at the start of the notebook
!date

#### import libraries and set notebook variables

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import diffxpy.api as de
import subprocess
import os

import warnings
# suppress all warnings for the remainder of the notebook session
warnings.simplefilter('ignore')

import random
# fixed seed so the random.sample() gene subset used in testing mode is repeatable
random.seed(420)

In [None]:
# parameters
# region_celltype is the group value and obs_type the adata.obs column it is
# matched against; both are passed to diffexp_group() and are left blank in
# this template
region_celltype = ''
obs_type = ''
# when True the analysis is restricted to a random subset of the genes
testing = False

In [None]:
# naming
cohort = 'aging'
# NOTE(review): assay is not referenced anywhere else in the visible notebook
assay = 'RNA'

# directories for initial setup
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
results_dir = f'{wrk_dir}/demux'

# in files
in_file = f'{results_dir}/{cohort}.pegasus.leiden_085.Age_group_young_old.h5ad'
glmmtmb_rscript_file = f'/home/gibbsr/notebooks/expression/adrd_neuro/{cohort}/glmmTMB.R'

# out files
# final results table; spaces in the region/cell-type name become underscores
out_file = f'{results_dir}/{cohort}.{region_celltype.replace(" ", "_")}.glmmtmb_age_diffs.csv'
# path templates for the temporary CSVs exchanged with the R script; the
# placeholders are filled in with str.format() by the analysis functions
temp_r_in_file = '{this_dir}/{name}_glmmtmb_in_df_temp.csv'
temp_r_out_file = '{this_dir}/{chrt}.{name}_glmmtmb_results_temp.csv'

# constants
# used as the min_cells threshold when refiltering genes
min_cell_count = 3
# adata.obs columns holding the brain region and cell-type labels
region_obs_feature = 'Brain_region'
celltype_obs_feature = 'new_anno'
# number of genes randomly sampled when testing is True
testing_sample_size = 100 #25

#### analysis functions

In [None]:
def subset_ad_by_type(data: ad.AnnData, group_name: str, type_name: str,
                      reapply_filter: bool=True, min_cell_count: int=3,
                      verbose: bool=False) -> ad.AnnData:
    """Return a copy of the cells whose `type_name` annotation equals `group_name`.

    Args:
        data: AnnData object to subset.
        group_name: value to select in `data.obs[type_name]`.
        type_name: name of the `data.obs` column to match against.
        reapply_filter: when True, rerun scanpy's gene and cell filters on
            the subset.
        min_cell_count: threshold handed to both filters as `min_counts`.
            NOTE(review): the name suggests a number of cells, but it is
            passed as `min_counts`, not `min_cells` - confirm this is intended.
        verbose: when True, print the shape before/after filtering and the
            subset itself.

    Returns:
        The subsetted (and optionally filtered) AnnData copy.
    """
    # copy so the in-place scanpy filters below never act on a view of `data`
    this_data = data[data.obs[type_name] == group_name].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # taken outside the branch so it is always bound; previously the verbose
    # print raised NameError when reapply_filter was False
    shape_after = this_data.shape
    if verbose:
        print(f'shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def convert_ad_to_df(data: ad.AnnData, young_age_limit: float=30.0, 
                     verbose: bool=False) -> pd.DataFrame:
    """Combine the expression matrix and sample annotations into one data frame.

    The result has the columns of `data.to_df()` followed by a fixed set of
    `data.obs` columns and two derived 0/1 indicator columns:
    `old` (Age > `young_age_limit`) and `female` (Sex == 'Female').

    Args:
        data: AnnData object to convert.
        young_age_limit: ages strictly above this value are flagged as old.
        verbose: when True, print the shape and display the head of the result.

    Returns:
        The combined data frame, or None when the expression and annotation
        indices are not equal.
    """
    expression = data.to_df()
    obs_columns = ['Brain_region', 'Age','Age_group', 'pool_name', 
                   'Sample_id', 'Sex', 'donor_id']
    covariates = data.obs[obs_columns].copy()
    # binary covariates derived from age and sex
    covariates['old'] = np.where((covariates['Age'] > young_age_limit), 1, 0)
    covariates['female'] = np.where((covariates['Sex'] == 'Female'), 1, 0)
    # only combine when both frames describe the same cells in the same order
    if not expression.index.equals(covariates.index):
        return None
    combined = pd.concat([expression, covariates], axis='columns')
    if verbose:
        print(combined.shape)
        display(combined.head())
    return combined

def feature_detected(feature: str=None, df: pd.DataFrame=None, 
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False) -> bool:
    """Decide whether a feature is detected in enough samples.

    A sample counts as detecting the feature when it has strictly more than
    `min_cell_count` rows with a value above zero. The feature is good when
    the fraction of such samples, out of all distinct `Sample_id` values in
    `df`, is at least `min_sample_det_rate`.

    Args:
        feature: name of the column in `df` to evaluate.
        df: data frame holding the feature column and a `Sample_id` column.
        min_cell_count: per-sample non-zero row count that must be exceeded.
        min_sample_det_rate: minimum fraction of samples that must qualify.
        verbose: when True, print a one-line summary for the feature.

    Returns:
        True when the feature passes the detection criteria, else False.
    """
    nz_df = df.loc[df[feature] > 0]
    nonzero_per_sample = nz_df['Sample_id'].value_counts()
    ok_sample_cnt = int((nonzero_per_sample > min_cell_count).sum())
    total_samples = df['Sample_id'].nunique()
    good_feature = ok_sample_cnt / total_samples >= min_sample_det_rate
    if verbose:
        print(feature, end=', ')
        print(f'nz_df.shape = {nz_df.shape}', end=', ')
        print(f'{ok_sample_cnt}/{total_samples}', end=', ')
        print(good_feature)
    return good_feature

def poorly_detected_features(features: list=None, df: pd.DataFrame=None, 
                             verbose=False, min_cell_count: int=3,
                             min_sample_det_rate: float=0.5) -> list:
    """Return the features that fail the `feature_detected` check.

    Args:
        features: feature (column) names to evaluate.
        df: data frame passed through to `feature_detected`.
        verbose: forwarded to `feature_detected`; also prints the number of
            poorly detected features.
        min_cell_count: forwarded to `feature_detected`.
        min_sample_det_rate: forwarded to `feature_detected`.

    Returns:
        List of the feature names for which `feature_detected` is False,
        in the order they appear in `features`.
    """
    # arguments are passed by keyword: passing `verbose` positionally put it
    # in the `min_cell_count` slot, replacing the threshold with a boolean
    bad_features = [
        feature for feature in features
        if not feature_detected(feature, df, min_cell_count=min_cell_count,
                                min_sample_det_rate=min_sample_det_rate,
                                verbose=verbose)
    ]
    if verbose:
        print(f'bad features counts is {len(bad_features)}')
    return bad_features

def non_de_features(data: ad.AnnData=None, alpha: float=0.05) -> list:
    """List the genes whose diffxpy t-test p-value is above `alpha`.

    Runs a t-test grouped on the `Age_group` annotation and collects the
    `gene` entries of the test summary with `pval` strictly greater than
    `alpha`.

    Args:
        data: AnnData object to test.
        alpha: p-value cutoff; genes above it are returned.

    Returns:
        List of gene names whose p-value exceeds `alpha`.
    """
    summary = de.test.t_test(data=data, grouping='Age_group').summary()
    above_alpha = summary['pval'] > alpha
    return list(summary[above_alpha]['gene'].values)

def save_df_for_glmmtmb_in_r(df: pd.DataFrame, group_name: str) -> dict:
    """Write `df` to the temporary CSV that is handed to the R script.

    Hyphens in column names are a problem when building formulas in R, so
    every column containing '-' is written with '_' in its place.

    Args:
        df: data frame to write.
        group_name: group label used in the file name (spaces become '_').

    Returns:
        Mapping of each original hyphenated column name to the name written
        to the file, so the rename can be reversed on the results.
    """
    has_hyphen = df.columns.str.contains('-')
    rename_cols = {col: col.replace('-', '_') for col in df.columns[has_hyphen]}
    csv_path = temp_r_in_file.format(this_dir=f'{wrk_dir}/expression', 
                                     name=group_name.replace(" ", "_"))
    # rename() returns a new frame, so the caller's data frame is untouched
    df.rename(columns=rename_cols).to_csv(csv_path)
    return rename_cols

def frmt_glmmtmb_script_cmd(group_name: str) -> str:
    """Build the Rscript command line for one group.

    Args:
        group_name: group label used in the temporary file names
            (spaces become '_').

    Returns:
        Command string: 'Rscript', the R script path, the temporary input
        CSV path and the temporary results CSV path, separated by spaces.
    """
    safe_name = group_name.replace(" ", "_")
    r_input = temp_r_in_file.format(this_dir=f'{wrk_dir}/expression', 
                                    name=safe_name)
    r_output = temp_r_out_file.format(this_dir=f'{results_dir}', chrt=cohort, 
                                      name=safe_name)
    return f'Rscript {glmmtmb_rscript_file} {r_input} {r_output}'

def read_glmmtmb_results(group_name: str, cols_to_rename: dict) -> pd.DataFrame:
    """Load the temporary results CSV for one group.

    Args:
        group_name: group label used in the results file name
            (spaces become '_').
        cols_to_rename: mapping of original feature name to the
            hyphen-free name that was written for R.

    Returns:
        The results data frame with its `feature` column mapped back to the
        original names.
    """
    results_path = temp_r_out_file.format(this_dir=f'{results_dir}', 
                                          chrt=cohort, 
                                          name=group_name.replace(" ", "_"))
    results = pd.read_csv(results_path)
    # invert the mapping (R-safe name -> original name) to undo the
    # '-' -> '_' substitution
    restore_names = dict(zip(cols_to_rename.values(), cols_to_rename.keys()))
    results['feature'] = results['feature'].replace(restore_names)
    return results

def diff_exp_of_features(df: pd.DataFrame, features: set, group_name: str, 
                         verbose: bool=False) -> pd.DataFrame:
    """Run the glmmTMB R script on `df` and return its results.

    Writes `df` to a temporary CSV, runs the R script through `Rscript`,
    reads the results CSV it produces, and removes both temporary files.

    Args:
        df: data frame of features and covariates to analyze.
        features: not used by this function; kept for interface
            compatibility.
        group_name: group label used to name the temporary files.
        verbose: when True, print the result shape and the completed
            subprocess object.

    Returns:
        The results data frame, with feature names restored to their
        original (hyphenated) form.

    Raises:
        RuntimeError: if the R script exits with a non-zero status.
    """
    safe_name = group_name.replace(" ", "_")
    temp_paths = (
        temp_r_in_file.format(this_dir=f'{wrk_dir}/expression', name=safe_name),
        temp_r_out_file.format(this_dir=f'{results_dir}', chrt=cohort, 
                               name=safe_name),
    )
    try:
        cols_dict = save_df_for_glmmtmb_in_r(df, group_name)
        this_cmd = frmt_glmmtmb_script_cmd(group_name)
        # run the cmd
        ret_out = subprocess.run(this_cmd.split(), capture_output=True)
        # report the R failure itself, rather than the missing-file error the
        # results read would otherwise raise
        if ret_out.returncode != 0:
            r_stderr = ret_out.stderr.decode(errors='replace')
            raise RuntimeError(f'glmmTMB Rscript failed for {group_name} '
                               f'(exit code {ret_out.returncode}): {r_stderr}')
        this_df = read_glmmtmb_results(group_name, cols_dict)
    finally:
        # delete temp files even when a step above failed; either file may
        # not exist yet at that point
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass
    if verbose:
        print(f'df shape is {this_df.shape}')
        print(f'subprocess returned {ret_out}')
    return this_df

def diffexp_group(data: ad.AnnData, group_name: str, 
                  type_name: str, min_cell_count: int=3, 
                  verbose: bool=False) -> pd.DataFrame:
    """Run the differential expression workflow for one group.

    Steps: subset the cells of the group, refilter genes, convert to a data
    frame, drop poorly detected features, drop features the quick t-test
    does not flag, then run glmmTMB on what remains.

    Args:
        data: AnnData object holding all cells.
        group_name: value to select in `data.obs[type_name]`.
        type_name: name of the `data.obs` column (brain region or cell-type).
        min_cell_count: threshold used when subsetting and when refiltering
            genes by `min_cells`.
        verbose: when True, print start/done progress messages.

    Returns:
        The glmmTMB results data frame with a `group` column set to
        `group_name`.

    Raises:
        ValueError: if the expression data and annotations of the subset
            cannot be combined.
    """
    if verbose:
        print(f'starting {group_name}', end='...')
    # subset anndata object by type_name (brain region or cell-type);
    # forward the caller's threshold instead of relying on the default
    type_ad = subset_ad_by_type(data, group_name, type_name, 
                                min_cell_count=min_cell_count)
    # now dealing with just cells in this region or cell-type so refilter genes
    sc.pp.filter_genes(type_ad, min_cells=min_cell_count)
    type_df = convert_ad_to_df(type_ad)
    if type_df is None:
        # convert_ad_to_df returns None when its indices do not match
        raise ValueError(f'could not combine expression and annotations '
                         f'for {group_name}')
    # find features poorly detected and don't include in analysis
    bad_features = poorly_detected_features(type_ad.var.index.values, type_df)
    type_clean_df = type_df.drop(columns=bad_features)
    # keep the AnnData's own feature order rather than set iteration order
    remaining_cols = set(type_clean_df.columns)
    ordered_features = [feat for feat in type_ad.var.index 
                        if feat in remaining_cols]
    type_clean_ad = type_ad[:, ordered_features]
    # compute simple DE, so only running slower glmmTMB on possibles
    exclude_features = non_de_features(type_clean_ad)
    type_clean_df = type_clean_df.drop(columns=exclude_features)
    keep_features = set(type_clean_ad.var.index) & set(type_clean_df.columns)
    results_df = diff_exp_of_features(type_clean_df, keep_features, group_name)
    results_df['group'] = group_name
    if verbose:
        print('done', end='. ')
    return results_df

#### read the anndata (h5ad) file

In [None]:
%%time
# load the input h5ad file and print a summary of the loaded object
adata = sc.read(in_file)
print(adata)

#### take a look at the cell counts by brain region and cell-type

In [None]:
# number of cells per brain region label
adata.obs[region_obs_feature].value_counts()

In [None]:
# number of cells per cell-type label
adata.obs[celltype_obs_feature].value_counts()

In [None]:
# UMAP colored by cell-type label, with the legend drawn on the plot
sc.pl.umap(adata, color=[celltype_obs_feature], legend_loc='on data')

##### find cell-types we won't use in the analysis
Remove them, and then refilter the genes based on cell count.

In [None]:
# cell-type labels containing 'uncertain' are excluded from the analysis;
# dropna()/str() keep the scan working when labels are missing or the
# column is not categorical
found_uncertain = [x for x in adata.obs[celltype_obs_feature].dropna().unique()
                   if 'uncertain' in str(x)]
print(found_uncertain)
# take an explicit copy so the in-place gene filter below works on a real
# AnnData object rather than a view of the original
adata = adata[~adata.obs[celltype_obs_feature].isin(found_uncertain), :].copy()
sc.pp.filter_genes(adata, min_cells=min_cell_count)
adata

### if testing the notebook for debugging purposes, subset the features

In [None]:
if testing:
    # restrict the run to a random subset of genes; cap the sample size at
    # the number of genes available because random.sample() raises
    # ValueError when asked for more items than exist
    all_genes = list(adata.var.index.values)
    genes = random.sample(all_genes, min(testing_sample_size, len(all_genes)))
    adata = adata[:,genes]

#### compute the differential expression info

In [None]:
%%time
# run the full differential expression workflow for the configured
# region/cell-type; verbose=True prints start/done progress messages
results_df = diffexp_group(adata, region_celltype, obs_type, verbose=True)

In [None]:
# show the complete results table
display(results_df)

In [None]:
alpha = 0.05
# rows for the 'old' term whose p-value is at or below alpha; the selection
# is computed once and reused for both the shape and the table
is_old_term = results_df['term'] == 'old'
is_significant = results_df['p.value'] <= alpha
significant_old = results_df.loc[is_old_term & is_significant]
print(significant_old.shape)
display(significant_old)

#### save the results

In [None]:
# write the results table to out_file without the data frame index
results_df.to_csv(out_file, index=False)

In [None]:
# shell escape: print the current date/time at the end of the notebook
!date