## Notebook for prepping data for differential expression in single-cell data using glmmTMB for a single region or cell-type; use as a template notebook for running each

Ran into parallelization (when launching from a notebook) and scaling issues when using glmmTMB.

So this notebook was split from glmmtmb_diffexp.ipynb to run only up through prepping and formatting the jobs. The jobs can then be run in parallel somewhere else, followed by another notebook that integrates and post-processes the results.

Based on some of the observations related to pseudoreplication and zero-inflation from

[Zimmerman KD, Espeland MA, Langefeld CD. A practical solution to pseudoreplication bias in single-cell studies. Nat Commun 2021;12:738.](https://pubmed.ncbi.nlm.nih.gov/33531494/)


In [None]:
!date

#### import libraries and set notebook variables

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import diffxpy.api as de
import json

import warnings
warnings.simplefilter('ignore')

import random
random.seed(420)

In [None]:
# parameters
# NOTE(review): the empty strings look like placeholders that are overridden
# per run (this is a template notebook) — confirm how values are injected
region_celltype = ''  # group value to subset on; passed to diffexp_group as group_name
obs_type = ''  # adata.obs column holding that value; passed to diffexp_group as type_name
testing = False  # when True, a random subset of genes is kept for debugging runs

In [None]:
# naming
cohort = 'aging'
assay = 'RNA'

# directories for initial setup
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'

# in files
# in_file = f'{quants_dir}/{cohort}.pegasus.leiden_085.Age_group_young_old.h5ad'
in_file = f'{quants_dir}/{cohort}.pegasus.leiden_085.subclustered.h5ad'
# NOTE(review): 'PATH' looks like an unfilled placeholder for the directory
# holding the R script — confirm it is replaced before the commands are run
glmmtmb_rscript_file = f'PATH/{cohort}/glmmTMB.R'

# out files
# these are str.format templates, filled in by the analysis functions below
temp_r_in_file = '{this_dir}/{name}_glmmtmb_in_df_temp.csv'
# NOTE(review): this file is written with json.dump although the name ends
# in .csv — confirm downstream readers expect that before renaming
temp_name_remap_json = '{this_dir}/{name}_gene_name_remap_temp.csv'
temp_r_out_file = '{this_dir}/{chrt}.{name}_glmmtmb_results_temp.csv'
diffxpy_out_file = '{this_dir}/{name}_de_diffxpy.csv'

# constants
# minimum number of cells a gene must be seen in (used with sc.pp.filter_genes)
min_cell_count = 3
# get cell counts by 'categories'
region_obs_feature = 'Brain_region'
celltype_obs_feature = 'new_anno'
# number of genes randomly kept when testing is True
testing_sample_size = 100 #25

#### analysis functions

In [None]:
def subset_ad_by_type(data: ad.AnnData, group_name: str, type_name: str,
                      reapply_filter: bool=True, min_cell_count: int=3,
                      verbose: bool=False) -> ad.AnnData:
    """Return a copy of `data` restricted to one group of cells.

    Args:
        data: annotated data object to subset.
        group_name: value to match in the obs column.
        type_name: name of the `data.obs` column compared against `group_name`.
        reapply_filter: when True, re-run scanpy's gene and cell filters on
            the subset.
        min_cell_count: threshold handed to both filters as `min_counts`.
            NOTE(review): despite the name it is applied as a minimum count
            threshold, not a minimum number of cells — confirm intent.
        verbose: when True, print the shapes before/after and the subset.

    Returns:
        The subset as an independent copy (the input is not modified).
    """
    this_data = data[data.obs[type_name] == group_name].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # take the shape after the optional filtering so it is always defined,
    # including when reapply_filter is False (equals shape_before then)
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def convert_ad_to_df(data: ad.AnnData, young_age_limit: float=30.0,
                     verbose: bool=False) -> pd.DataFrame:
    """Flatten an AnnData object into one cells x (features + covariates) table.

    The expression matrix from `data.to_df()` is joined column-wise with a
    fixed set of obs columns plus two derived 0/1 indicators: `old`
    (Age > young_age_limit) and `female` (Sex == 'Female').

    Returns:
        The combined data frame, or None when the expression and annotation
        indexes are not identical.
    """
    obs_columns = ['Brain_region', 'Age', 'Age_group', 'pool_name',
                   'Sample_id', 'Sex', 'donor_id']
    expression = data.to_df()
    covariates = data.obs[obs_columns].copy()
    covariates['old'] = np.where(covariates['Age'] > young_age_limit, 1, 0)
    covariates['female'] = np.where(covariates['Sex'] == 'Female', 1, 0)
    # only combine when both tables describe the same cells in the same order
    if not expression.index.equals(covariates.index):
        return None
    combined = pd.concat([expression, covariates], axis='columns')
    if verbose:
        print(f'anndata to pandas df complete: {combined.shape}')
        print(combined.shape)
        display(combined.head())
    return combined

def feature_detected(feature: str=None, df: pd.DataFrame=None,
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False) -> bool:
    """Decide whether a feature is detected in enough samples.

    A sample counts as detecting the feature when more than `min_cell_count`
    of its rows have a value above zero in the `feature` column. The feature
    is considered detected when the fraction of such samples, out of all
    distinct `Sample_id` values in `df`, is at least `min_sample_det_rate`.

    Args:
        feature: name of the column in `df` to evaluate.
        df: table with the feature column and a `Sample_id` column.
        min_cell_count: per-sample count of non-zero rows that must be exceeded.
        min_sample_det_rate: minimum fraction of samples that must pass.
        verbose: when True, print the feature, counts and decision.

    Returns:
        True when the feature passes the detection-rate threshold.
    """
    expressing = df.loc[df[feature] > 0]
    rows_per_sample = expressing['Sample_id'].value_counts()
    ok_sample_cnt = int((rows_per_sample > min_cell_count).sum())
    total_samples = df['Sample_id'].nunique()
    good_feature = bool(ok_sample_cnt / total_samples >= min_sample_det_rate)
    if verbose:
        print(feature, end=', ')
        print(f'nz_df.shape = {expressing.shape}', end=', ')
        print(f'{ok_sample_cnt}/{df.Sample_id.nunique()}', end=', ')
        print(good_feature)
    return good_feature

def poorly_detected_features(features: list=None, df: pd.DataFrame=None,
                             verbose=False) -> list:
    """Return the features that fail the `feature_detected` check.

    Args:
        features: iterable of column names in `df` to evaluate.
        df: table holding those columns plus `Sample_id`.
        verbose: when True, print per-feature details and the final count.

    Returns:
        List of feature names that were not sufficiently detected, in the
        order they appear in `features`.
    """
    # verbose must go by keyword: the third positional parameter of
    # feature_detected is min_cell_count, so passing it positionally would
    # replace the detection threshold with False/True (0/1)
    bad_features = [feature for feature in features
                    if not feature_detected(feature, df, verbose=verbose)]
    if verbose:
        print(f'bad features counts is {len(bad_features)}')
    return bad_features

def non_de_features(data: ad.AnnData, group_name: str, alpha: float=0.05) -> list:
    """Screen features with a diffxpy t-test and list the non-significant ones.

    The t-test groups cells by the `Age_group` obs column. The full summary
    table is written to the `diffxpy_out_file` location under `quants_dir`
    (spaces in `group_name` become underscores in the file name).

    Returns:
        Gene names whose `pval` in the summary is greater than `alpha`.
    """
    summary = de.test.t_test(data=data, grouping='Age_group').summary()
    out_path = diffxpy_out_file.format(this_dir=quants_dir,
                                       name=group_name.replace(" ", "_"))
    summary.to_csv(out_path)
    not_significant = summary['pval'] > alpha
    return list(summary[not_significant]['gene'].values)

def save_df_for_glmmtmb_in_r(df: pd.DataFrame, group_name: str) -> dict:
    """Write the glmmTMB input table and the column-name remapping to disk.

    R doesn't like column names with hyphens in data frames when building
    formulas, so hyphens in column names are temporarily replaced with
    underscores before the table is saved. The original-to-replacement
    mapping is saved as JSON to the `temp_name_remap_json` location so the
    names can be restored later with `json.load` on that same path.

    Both files are written under `quants_dir`; spaces in `group_name`
    become underscores in the file names.

    NOTE(review): if a hyphen-free column already has the replaced name
    (e.g. both 'A-B' and 'A_B' exist) the rename produces duplicate columns
    — confirm this cannot happen for the gene names in use.

    Args:
        df: cells x (features + covariates) table to save.
        group_name: region or cell-type label used to build the file names.

    Returns:
        The mapping of original column name -> R-safe column name.
    """
    file_label = group_name.replace(" ", "_")
    # find features containing hyphen
    feats_w_hyphen = df.columns[df.columns.str.contains('-')]
    # make dictionary to do replace
    rename_cols = {x: x.replace('-', '_') for x in feats_w_hyphen}
    df = df.rename(columns=rename_cols)
    df.to_csv(temp_r_in_file.format(this_dir=f'{quants_dir}', name=file_label))
    # save the gene rename dict; the context manager closes (and flushes)
    # the file instead of leaving the handle to the garbage collector
    remap_path = temp_name_remap_json.format(this_dir=f'{quants_dir}',
                                             name=file_label)
    with open(remap_path, 'w') as remap_file:
        json.dump(rename_cols, remap_file)
    return rename_cols

def frmt_glmmtmb_script_cmd(group_name: str) -> str:
    """Build the Rscript command line that runs glmmTMB for one group.

    The input path is the table location under `quants_dir` and the output
    path is the results location under `results_dir`; spaces in
    `group_name` become underscores in both file names.

    Returns:
        The command as a single string: script path, input file, output file.
    """
    label = group_name.replace(" ", "_")
    r_input = temp_r_in_file.format(this_dir=f'{quants_dir}', name=label)
    r_output = temp_r_out_file.format(this_dir=f'{results_dir}',
                                      chrt=cohort, name=label)
    return f'Rscript {glmmtmb_rscript_file} {r_input} {r_output}'

def diff_exp_of_features(df: pd.DataFrame, features: set, group_name: str) -> str:
    """Save the glmmTMB input for a group and return the command to run it.

    Args:
        df: cells x (features + covariates) table to hand to R.
        features: accepted for the caller's convenience; not read here.
        group_name: region or cell-type label used to build the file names.

    Returns:
        The Rscript command string for this group.
    """
    # the input table and gene-name remap must be on disk before the job runs
    save_df_for_glmmtmb_in_r(df, group_name)
    return frmt_glmmtmb_script_cmd(group_name)

def diffexp_group(data: ad.AnnData, group_name: str,
                  type_name: str, min_cell_count: int=3,
                  verbose: bool=False) -> str:
    """Prepare the glmmTMB differential-expression job for one group.

    Steps: subset the cells of the group, re-filter genes by cell count,
    convert to a data frame, drop poorly detected features, drop features
    that a diffxpy t-test screen finds non-significant, then write the R
    input files and format the Rscript command.

    Args:
        data: full annotated data object.
        group_name: value of the group (brain region or cell-type) to analyse.
        type_name: name of the `data.obs` column holding `group_name`.
        min_cell_count: minimum number of cells a gene must be seen in,
            applied after subsetting.
        verbose: when True, print progress messages.

    Returns:
        The Rscript command string for this group (not a data frame).

    Raises:
        ValueError: if the expression and annotation tables of the subset
            cannot be combined because their indexes differ.
    """
    if verbose:
        print(f'starting subset {group_name}')
    # subset anndata object by type_name (brain region or cell-type)
    type_ad = subset_ad_by_type(data, group_name, type_name)
    # now dealing with just cells in this region or cell-type so refilter genes
    sc.pp.filter_genes(type_ad, min_cells=min_cell_count)
    if verbose:
        print('converting anndata to pandas df')
    type_df = convert_ad_to_df(type_ad)
    if type_df is None:
        # convert_ad_to_df signals an index mismatch with None; fail here with
        # a clear message rather than an AttributeError further down
        raise ValueError(f'could not convert {group_name} to a data frame: '
                         'expression and annotation indexes differ')
    # find features poorly detected and don't include in analysis
    if verbose:
        print(f'finding poorly detected features from cells x features {type_df.shape}')
    bad_features = poorly_detected_features(type_ad.var.index.values, type_df)
    type_clean_df = type_df.drop(columns=bad_features)
    keep_features = set(type_ad.var.index) & set(type_clean_df.columns)
    # select in the original var order: iterating the set directly would give
    # a gene order that can change from run to run
    ordered_keep = [feature for feature in type_ad.var.index
                    if feature in keep_features]
    type_clean_ad = type_ad[:, ordered_keep]
    # compute simple DE, so only running slower glmmTMB on possibles
    if verbose:
        print('running diffxpy t-test screen')
    exclude_features = non_de_features(type_clean_ad, group_name)
    type_clean_df = type_clean_df.drop(columns=exclude_features)
    keep_features = set(type_clean_ad.var.index) & set(type_clean_df.columns)
    if verbose:
        print(f'formatting glmmTMB command for {len(keep_features)} features and {type_clean_df.shape[0]} cells')
    this_cmd = diff_exp_of_features(type_clean_df, keep_features, group_name)
    if verbose:
        print(f'done', end='. ')
    return this_cmd

#### read the anndata (h5ad) file

In [None]:
%%time
# load the input h5ad into memory; cache=True is passed through to scanpy's
# reader (presumably enabling its on-disk read cache — see scanpy docs)
adata = sc.read(in_file, cache=True)
print(adata)

#### take a look at the cell counts by cell type

In [None]:
# get cell counts by 'categories'
# `categories` is reused by the next cell for the per-age-group sample counts
categories = ['Brain_region', 'new_anno']
for this_cat in categories:
    # number of cells (obs rows) per distinct value of this obs column
    print(adata.obs[this_cat].value_counts())

#### get sample counts per age group by 'categories'

In [None]:
# relies on `categories` defined in the previous cell
for this_cat in categories:
    # number of distinct Sample_id values for each (category value, Age_group) pair
    print(adata.obs.groupby([this_cat,'Age_group'])['Sample_id'].nunique())

In [None]:
# UMAP coloured by the cell-type annotation column, with labels drawn on the plot
sc.pl.umap(adata, color=[celltype_obs_feature], legend_loc='on data')

##### find cell-types we won't use in analysis
remove them, and then refilter genes based on cell count

In [None]:
# labels to drop: every annotation containing 'uncertain', plus the
# explicitly listed 'Astrocyte-GFAP-Hi'
# NOTE(review): .unique().to_list() assumes the obs column is categorical
# (a plain object column would yield an ndarray without to_list) — confirm
found_uncertain = [x for x in adata.obs[celltype_obs_feature].unique().to_list() 
                   if 'uncertain' in x] + ['Astrocyte-GFAP-Hi']
print(found_uncertain)
# keep only the cells whose annotation is not in the drop list
adata = adata[~adata.obs[celltype_obs_feature].isin(found_uncertain ), :]
# cells were removed, so re-apply the minimum cells-per-gene filter
sc.pp.filter_genes(adata, min_cells=min_cell_count)
adata

### if testing the notebook for debugging purposes, subset the features

In [None]:
if testing:
    # debugging runs only: keep a random subset of genes (sampled without
    # replacement) so the downstream steps finish quickly
    gene_names = list(adata.var.index.values)
    # clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population holds
    sample_size = min(testing_sample_size, len(gene_names))
    genes = random.sample(gene_names, sample_size)
    adata = adata[:,genes]

#### compute/prep for the differential expression info

In [None]:
%%time
# prepare the input files for the configured group and capture the Rscript
# command that diffexp_group returns; the glmmTMB fit itself is not run here
result_cmd = diffexp_group(adata, region_celltype, obs_type, verbose=True)

In [None]:
# show the formatted Rscript command so it can be submitted as a job elsewhere
print(result_cmd)

In [None]:
!date