## Notebook to prepare the input files from the replication data for analysis with glmmTMB

In [None]:
!date

#### import libraries

In [None]:
import scanpy as sc
from os.path import exists
from pandas import DataFrame, concat, read_csv
from anndata import AnnData
import numpy as np
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
import json
from statsmodels.stats.multitest import multipletests
from polars import read_csv as pl_read_csv
from sklearn.preprocessing import MinMaxScaler
from multiprocessing import Process

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming: project prefix and the replication data-set name used in file names
project = 'aging_phase1'
set_name = f'{project}_replication'

# directories (all derived from the absolute working directory below)
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
replication_dir = f'{wrk_dir}/replication'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'
sc.settings.figdir = f'{figures_dir}/'

# in files
anndata_file = f'{replication_dir}/{set_name}.scvi.h5ad'
# NOTE: despite the '.csv' suffix this file is written with json.dump and read
# back with json.load; it holds the {original: R-safe} feature-name mapping
temp_name_remap_json = '{this_dir}/{name}_gene_name_remap_temp.csv'
# template for the glmmTMB result tables that are read from results_dir
temp_r_out_file = '{this_dir}/aging.{name}_glmmtmb_results_temp.csv'

# out files
# template for the glmmTMB input tables; written into replication_dir by this
# notebook and also used to locate existing input tables under quants_dir
temp_r_in_file = '{this_dir}/{name}_glmmtmb_in_df_temp.csv'


# variables
# DEBUG gates the extra display() calls in the cells below
DEBUG = True
REGIONS = ['Middle_temporal_gyrus', 'Putamen', 
           'Entorhinal_cortex', 'Subventricular_zone']
# NOTE(review): CELLTYPES is not referenced anywhere else in the visible notebook
CELLTYPES=['ExN', 'Oligodendrocyte', 'Astrocyte', 'InN', 'OPC', 'Mural', 
           'Microglia', 'SPN', 'Endothelial', 'Ependymal']
# keys are used to build the '{region}_{cell type}' names of existing files;
# values are the Cell_type labels used to subset the replication AnnData
cell_abbr_mappings = {'ExN': 'ExN', 'Oligodendrocyte': 'Oligo', 'Astrocyte': 'Astro', 
                      'InN': 'InN', 'OPC': 'OPC', 'Microglia': 'Micro', 'Endothelial': 'Endo'}
# p-value threshold used to select the 'nominally' significant results
MAX_ALPHA = 0.05
# key passed to AnnData.to_df() when extracting the expression matrix
SCVI_NORMALIZED_KEY = 'scvi_normalized'

#### functions

In [None]:
def read_feature_renamed_map(cell_name: str) -> dict:
    """Load the saved feature-name remap dictionary for one cell group.

    The file is looked up under ``quants_dir`` using the
    ``temp_name_remap_json`` template; spaces in ``cell_name`` are replaced
    with underscores to build the file name.

    Args:
        cell_name: name of the region/cell-type group.

    Returns:
        The dictionary stored in the JSON file.
    """
    remap_path = temp_name_remap_json.format(this_dir=quants_dir,
                                             name=cell_name.replace(" ", "_"))
    # context manager guarantees the handle is closed, even if parsing fails
    with open(remap_path) as remap_file:
        rename_cols = json.load(remap_file)
    return rename_cols

def reformat_glmmtmb_df(df: DataFrame) -> DataFrame:
    """Collapse glmmTMB term-level output into one row per feature.

    Rows whose ``term`` is ``'old'`` supply the effect statistics; the
    ``'(Intercept)'`` rows supply the ``intercept`` column. Features missing
    either term are dropped by the inner merge.

    Args:
        df: results with at least the columns ``feature``, ``term``,
            ``estimate``, ``std.error``, ``statistic`` and ``p.value``.

    Returns:
        Frame with columns ``feature``, ``intercept``, ``estimate``,
        ``std.error``, ``statistic`` and ``p.value``.
    """
    output_columns = ['feature', 'intercept', 'estimate',
                      'std.error', 'statistic', 'p.value']
    is_old_term = df['term'] == 'old'
    is_intercept = df['term'] == '(Intercept)'
    # intercept estimates, relabelled so they do not clash with the term estimate
    intercepts = (df.loc[is_intercept, ['feature', 'estimate']]
                  .rename(columns={'estimate': 'intercept'}))
    per_feature = df.loc[is_old_term].merge(intercepts, how='inner', on='feature')
    return per_feature[output_columns]

def read_glmmtmb_results(cell_name: str, group_type: str, cols_to_rename: dict) -> DataFrame:
    """Read one glmmTMB result table and restore the original feature names.

    Args:
        cell_name: group name used to build the result file name (spaces are
            replaced with underscores); also stored in the ``tissue`` column.
        group_type: label stored in the ``type`` column.
        cols_to_rename: mapping of original feature name -> name used in R.

    Returns:
        One row per feature (see ``reformat_glmmtmb_df``) plus the ``tissue``
        and ``type`` columns.
    """
    results_path = temp_r_out_file.format(this_dir=results_dir,
                                          name=cell_name.replace(" ", "_"))
    raw_results = read_csv(results_path)
    # invert the key/value pairs so the R-safe names ('-' -> '_') map back
    # to the original feature names
    r_name_to_original = dict(zip(cols_to_rename.values(), cols_to_rename.keys()))
    raw_results['feature'] = raw_results['feature'].replace(r_name_to_original)
    tidy_results = reformat_glmmtmb_df(raw_results)
    tidy_results['tissue'] = cell_name
    tidy_results['type'] = group_type
    return tidy_results

def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p.value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Return a copy of ``df`` with multiple-testing adjusted p-values added.

    Args:
        df: table containing the p-value column.
        alpha: level passed to ``multipletests`` and used for the reported count.
        p_col: name of the column holding the unadjusted p-values.
        method: correction method passed to ``multipletests``; also the name
            of the new column.
        verbose: if True, print the shape of the rows below ``alpha``.

    Returns:
        Copy of ``df`` with the adjusted values in a column named ``method``.
    """
    p_values = np.array(df[p_col])
    correction = multipletests(p_values, alpha=alpha, method=method)
    ret_df = df.copy()
    # element 1 of the returned tuple holds the adjusted p-values
    ret_df[method] = correction[1]
    if verbose:
        significant = ret_df.loc[ret_df[method] < alpha]
        print(f'total significant after correction: {significant.shape}')
    return ret_df

def subset_anndata(data: AnnData, cell_name: str, features: list, reapply_filter: bool=True,
                   min_cell_count: int=3, verbose: bool=False) -> AnnData:
    """Copy out the cells of one cell type restricted to the given features.

    Args:
        data: full AnnData object; ``data.obs.Cell_type`` selects the cells.
        cell_name: value of ``Cell_type`` to keep.
        features: variable names to keep.
        reapply_filter: if True, run ``sc.pp.filter_genes`` and
            ``sc.pp.filter_cells`` on the subset (their return values are not
            used, so the subset is expected to be filtered in place).
        min_cell_count: value passed as ``min_counts`` to both filters.
        verbose: if True, print the shapes before/after filtering and the
            resulting object.

    Returns:
        The subsetted (and optionally filtered) copy.
    """
    this_data = data[(data.obs.Cell_type == cell_name), features].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # recorded outside the branch: it used to be bound only when filtering,
    # so reapply_filter=False with verbose=True raised a NameError
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def scale_dataframe(this_df : DataFrame):
    """Scale a data frame with a default ``MinMaxScaler``, keeping its labels.

    Args:
        this_df: numeric table to scale.

    Returns:
        New DataFrame of the scaled values with the same index and columns.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(this_df)
    # fit_transform output is re-wrapped with the original row/column labels
    return DataFrame(data=scaled_values, index=this_df.index,
                     columns=this_df.columns)

def convert_ad_to_df(data: AnnData, young_age_limit: float=30.0,
                     scale: bool=True, verbose: bool=False) -> DataFrame:
    """Build the per-cell modelling table from an AnnData object.

    The expression values come from ``data.to_df(SCVI_NORMALIZED_KEY)`` and
    are optionally passed through ``scale_dataframe``. ``Sample_ID``, ``Age``
    and ``Sex`` are taken from ``data.obs`` and two indicator columns are
    added: ``old`` (1 when ``Age`` > ``young_age_limit``) and ``female``
    (1 when ``Sex`` equals ``'Female'``).

    Args:
        data: AnnData object to convert.
        young_age_limit: ages strictly above this value are flagged as old.
        scale: whether to scale the expression values.
        verbose: if True, print the shape and display the head of the result.

    Returns:
        The combined table indexed by ``barcodekey``, or ``None`` when the
        expression and annotation indices are not equal.
    """
    expression_df = data.to_df(SCVI_NORMALIZED_KEY)
    if scale:
        expression_df = scale_dataframe(expression_df)
    annots = data.obs[['Sample_ID', 'Age','Sex']].copy()
    annots['old'] = np.where(annots['Age'] > young_age_limit, 1, 0)
    annots['female'] = np.where(annots['Sex'] == 'Female', 1, 0)
    # only combine when both tables describe the same cells in the same order
    if not expression_df.index.equals(annots.index):
        return None
    combined_df = concat([expression_df, annots], axis='columns')
    combined_df.index.name = 'barcodekey'
    if verbose:
        print(f'anndata to pandas df complete: {combined_df.shape}')
        print(combined_df.shape)
        display(combined_df.head())
    return combined_df

def feature_detected(feature_col, features: list=None, df: DataFrame=None,
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False):
    """Decide whether one column counts as a well-detected feature.

    Columns whose name is not in ``features`` always pass. For the others,
    the cells with a value > 0 are counted per ``Sample_ID``; a sample counts
    as detecting the feature when it has more than ``min_cell_count`` such
    cells, and the feature passes when the detecting samples make up at
    least ``min_sample_det_rate`` of all samples in ``df``.

    Args:
        feature_col: the column (a Series with a ``name``) being checked.
        features: names of the columns that are features to be checked.
        df: full table containing the ``Sample_ID`` column.
        min_cell_count: per-sample count that must be exceeded.
        min_sample_det_rate: minimum fraction of detecting samples.
        verbose: if True, print the intermediate counts.

    Returns:
        Truthy when the column should be kept, falsy otherwise.
    """
    # non-feature columns are never flagged
    if feature_col.name not in features:
        return True
    expressed = feature_col[feature_col > 0]
    cells_per_sample = df.loc[expressed.index].Sample_ID.value_counts()
    detecting_samples = (cells_per_sample > min_cell_count).sum()
    total_samples = df.Sample_ID.nunique()
    good_feature = detecting_samples / total_samples >= min_sample_det_rate
    if verbose:
        print(feature_col.name, end=', ')
        print(f'nz_df.shape = {expressed.shape}', end=', ')
        print(f'{detecting_samples}/{total_samples}', end=', ')
        print(good_feature)
    return good_feature

def poorly_detected_features(features: list=None, df: DataFrame=None,
                             verbose=False) -> list:
    """List the columns of ``df`` that fail the ``feature_detected`` check.

    Args:
        features: names of the columns to be checked as features.
        df: per-cell table; every column is passed to ``feature_detected``.
        verbose: if True, print how many columns failed.

    Returns:
        Names of the columns for which ``feature_detected`` was falsy.
    """
    # one pass/fail flag per column, evaluated with the default thresholds
    detected_flags = df.apply(feature_detected, features=features, df=df)
    bad_features = detected_flags[~detected_flags].index.to_list()
    if verbose:
        print(f'bad features counts is {len(bad_features)}')
    return bad_features

def save_df_for_glmmtmb_in_r(df: DataFrame, cell_name: str):
    """Write the glmmTMB input table and its feature-name remap to disk.

    R does not like hyphens in data-frame column names when building
    formulas, so every column containing '-' is renamed with '_' before the
    table is written. The {original: renamed} mapping is saved as JSON so the
    names can be restored later. Both files go to ``replication_dir``; spaces
    in ``cell_name`` are replaced with underscores in the file names.

    Args:
        df: per-cell table to save (the caller's frame is not modified).
        cell_name: group name used to build the output file names.
    """
    file_stem = cell_name.replace(" ", "_")
    # find the columns containing a hyphen and build the rename dictionary
    feats_w_hyphen = df.columns[df.columns.str.contains('-')]
    rename_cols = {x: x.replace('-', '_') for x in feats_w_hyphen}
    df = df.rename(columns=rename_cols)
    df.to_csv(temp_r_in_file.format(this_dir=f'{replication_dir}',
                                    name=file_stem))
    # save the gene rename dict; the context manager flushes and closes the
    # file instead of leaving the write handle to the garbage collector
    remap_path = temp_name_remap_json.format(this_dir=f'{replication_dir}',
                                             name=file_stem)
    with open(remap_path, 'w') as remap_file:
        json.dump(rename_cols, remap_file)

def diffexp_group(data: AnnData, cell_name: str,
                  min_cell_count: int=3,
                  verbose: bool=False) -> None:
    """Prepare and save the glmmTMB input table for one cell group.

    Converts ``data`` to a per-cell table, drops the poorly detected
    features and writes the result with ``save_df_for_glmmtmb_in_r``.

    Args:
        data: AnnData already subset to the cells/features of interest.
        cell_name: group name used for the output files and messages.
        min_cell_count: currently unused; kept for interface compatibility.
        verbose: if True, print progress messages.

    Returns:
        None. The output is the files written to disk.

    Raises:
        ValueError: if ``convert_ad_to_df`` returns ``None`` (expression and
            annotation indices do not match).
    """
    if verbose:
        print('converting anndata to pandas df')
    type_df = convert_ad_to_df(data)
    if type_df is None:
        # fail with a clear message instead of an AttributeError further down
        raise ValueError(f'could not convert {cell_name} to a data frame: '
                         'expression and annotation indices differ')
    # find features poorly detected and don't include in analysis
    if verbose:
        print(f'finding poorly detected features from cells x features {type_df.shape}')
    bad_features = poorly_detected_features(data.var.index.values, type_df)
    type_clean_df = type_df.drop(columns=bad_features)
    keep_features = set(data.var.index) & set(type_clean_df.columns)
    if verbose:
        print(f'formatting glmmTMB command for {len(keep_features)} features and {type_clean_df.shape[0]} cells')
    save_df_for_glmmtmb_in_r(type_clean_df, cell_name)
    print(f'\ndone with {cell_name} kept {len(keep_features)} features and {type_clean_df.shape[0]} cells')

def diffexp_group_wrapper(data: AnnData, cell_name: str):
    """Call ``diffexp_group`` with its default options and discard the result.

    Used as the ``target`` of the ``Process`` objects started by this notebook.
    """
    diffexp_group(data=data, cell_name=cell_name)

### load data

#### load replication data

In [None]:
%%time
# read the replication AnnData file
# NOTE(review): cache=True presumably lets scanpy reuse a cached copy — confirm
adata = sc.read(anndata_file, cache=True)
print(adata)
if DEBUG:
    # five randomly chosen rows of the cell annotations (no seed is set, so
    # the rows shown change between runs)
    display(adata.obs.sample(5))

In [None]:
# number of cells per annotated sex
adata.obs['Sex'].value_counts()

In [None]:
# number of cells per cell type
cell_type_counts = adata.obs['Cell_type'].value_counts()
display(cell_type_counts)

In [None]:
# number of cells per sample
sample_cell_counts = adata.obs['Sample_ID'].value_counts()
display(sample_cell_counts)

#### load discovery results
Use these results to determine which features in which cell types need to be analyzed.

In [None]:
# collect one result frame per region/cell-type and concatenate once at the
# end, instead of re-concatenating the growing frame on every iteration
result_frames = []
for region in REGIONS:
    for cell_type in cell_abbr_mappings.keys():
        print(region, cell_type)
        in_results_file = f'{results_dir}/aging.{region}_{cell_type}_glmmtmb_results_temp.csv'
        if exists(in_results_file):
            this_tissue = f'{region}_{cell_type}'
            renamed_features = read_feature_renamed_map(this_tissue)
            result_frames.append(read_glmmtmb_results(this_tissue, 'region_broad_type',
                                                      renamed_features))
            print('Done.')
        else:
            print('Not found, skipping.')
# fail here with a clear message rather than in a later cell on a None value
if not result_frames:
    raise FileNotFoundError(f'no discovery glmmTMB result files found in {results_dir}')
glmmtmb_results = concat(result_frames)

### compute the FDR values

In [None]:
# treat missing p-values as non-significant (1) before the correction
filled_p_values = glmmtmb_results['p.value'].fillna(1)
glmmtmb_results = compute_bh_fdr(glmmtmb_results.assign(**{'p.value': filled_p_values}))
print(f'results shape is {glmmtmb_results.shape}')
if DEBUG:
    # ten smallest adjusted p-values
    top_by_fdr = glmmtmb_results.sort_values('fdr_bh').head(10)
    display(top_by_fdr)

#### how many are 'nominally' significant

In [None]:
# rows with an unadjusted p-value below the threshold
is_nominal = glmmtmb_results['p.value'] < MAX_ALPHA
nominal_df = glmmtmb_results.loc[is_nominal]
print(nominal_df.shape)
if DEBUG:
    # see bottom of nominally significant
    weakest_nominal = nominal_df.sort_values('p.value').tail(10)
    display(weakest_nominal)

#### count of significant genes by brain region cell type

In [None]:
# use the shared MAX_ALPHA constant instead of a hard-coded 0.05 so this count
# stays consistent with the nominal-significance threshold
glmmtmb_results.loc[glmmtmb_results['fdr_bh'] < MAX_ALPHA].tissue.value_counts()

In [None]:
# nominally significant features per region/cell-type group
nominal_df['tissue'].value_counts()

### format the data for input to glmmTMB
Based on the nominally significant results from the discovery glmmTMB analysis.

In [None]:
# for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
#     cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
#     cell_name = f'Frontal_cortex_{rep_cell_type}'
#     # subset the adata by cell-type and features
#     features = nominal_df.loc[nominal_df.tissue.isin(cell_names)].feature.unique()
#     if len(features) > 0:
#         print(f'--- processing {cell_name} in parallel')
#         features = list(set(features) & set(adata.var.index))
#         adata_sub = subset_anndata(adata, rep_cell_type, features)
#         diffexp_group(adata_sub, cell_name)

Based on all features that were testable inputs for the discovery glmmTMB analysis.

In [None]:
%%time

cmds = {}
for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
    cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
    features = {}
    for cell_name in cell_names:
        in_file = temp_r_in_file.format(this_dir=quants_dir, name=cell_name)
        glmm_in_df = pl_read_csv(in_file)
        features = set(features) | set(glmm_in_df.columns)
    # need to deal with hyphen to underscore revert
    features = [st.replace('_', '-') for st in features]
    features = list(set(features) & set(adata.var.index))
    print(f'{rep_cell_type} might test {len(features)} features')
    adata_sub = subset_anndata(adata, rep_cell_type, features)
    cell_name = f'Frontal_cortex_{rep_cell_type}'
    p = Process(target=diffexp_group_wrapper,args=(adata_sub, cell_name))
    p.start()
    # Append process and key to keep track
    cmds[rep_cell_type] = p    
    # diffexp_group(adata_sub, cell_name)
# Wait for all processes to finish
for key, p in cmds.items():
    p.join()    

In [None]:
!date