## Notebook to prepare the input files from the replication data for analysis with glmmTMB

In [1]:
# print the current date/time via the shell at the start of the notebook run
!date

Thu Dec 14 14:24:43 UTC 2023


#### import libraries

In [2]:
import scanpy as sc
from os.path import exists
from pandas import DataFrame, concat, read_csv
from anndata import AnnData
import numpy as np
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
import json
from statsmodels.stats.multitest import multipletests
from polars import read_csv as pl_read_csv
from multiprocessing import Process

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [3]:
# naming
project = 'aging_phase1'
set_name = f'{project}_replication'

# directories
# NOTE(review): absolute path; everything below is derived from wrk_dir
wrk_dir = '/home/jupyter/brain_aging_phase1'
replication_dir = f'{wrk_dir}/replication'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'
sc.settings.figdir = f'{figures_dir}/'

# in files
anndata_file = f'{replication_dir}/{set_name}.full.h5ad'
# the templates below are filled in later with str.format(this_dir=..., name=...)
# NOTE: despite the .csv suffix, this file is read and written with the json module
temp_name_remap_json = '{this_dir}/{name}_gene_name_remap_temp.csv'
temp_r_out_file = '{this_dir}/aging.{name}_glmmtmb_results_temp.csv'

# out files
# NOTE: this template is also used with quants_dir to read the discovery input tables
temp_r_in_file = '{this_dir}/{name}_glmmtmb_in_df_temp.csv'


# variables
# when True, cells display extra DataFrame previews
DEBUG = True
REGIONS = ['Middle_temporal_gyrus', 'Putamen', 
           'Entorhinal_cortex', 'Subventricular_zone']
# NOTE(review): CELLTYPES is not referenced by the cells visible here - TODO confirm it is still needed
CELLTYPES=['ExN', 'Oligodendrocyte', 'Astrocyte', 'InN', 'OPC', 'Mural', 
           'Microglia', 'SPN', 'Endothelial', 'Ependymal']
# keys are the cell-type names used in the discovery file names, values are the
# Cell_type labels used to subset the replication data
cell_abbr_mappings = {'ExN': 'ExN', 'Oligodendrocyte': 'Oligo', 'Astrocyte': 'Astro', 
                      'InN': 'InN', 'OPC': 'OPC', 'Microglia': 'Micro', 'Endothelial': 'Endo'}
# p-value threshold used to select the nominally significant results
MAX_ALPHA = 0.05

#### functions

In [4]:
def read_feature_renamed_map(cell_name: str) -> dict:
    """Load the feature-name remapping dictionary saved for one cell group.

    The file path is built from `temp_name_remap_json` under `quants_dir`;
    spaces in `cell_name` are replaced with underscores to form the file name.

    Args:
        cell_name: name of the cell group whose remapping should be read.

    Returns:
        The object parsed from the json file (a dict as written by
        `save_df_for_glmmtmb_in_r`: original name -> renamed name).

    Raises:
        FileNotFoundError: if the remapping file does not exist.
    """
    remap_path = temp_name_remap_json.format(this_dir=quants_dir,
                                             name=cell_name.replace(" ", "_"))
    # use a context manager so the file handle is always closed
    with open(remap_path) as remap_file:
        return json.load(remap_file)

def reformat_glmmtmb_df(df: DataFrame) -> DataFrame:
    """Collapse long-format glmmTMB results into one row per feature.

    Rows whose `term` is 'old' supply the effect columns; the `estimate` of the
    matching '(Intercept)' row is attached as the `intercept` column. Features
    lacking either row are dropped by the inner join.

    Args:
        df: results with at least the columns feature, term, estimate,
            std.error, statistic and p.value.

    Returns:
        DataFrame with columns feature, intercept, estimate, std.error,
        statistic and p.value.
    """
    out_columns = ['feature', 'intercept', 'estimate', 'std.error', 'statistic', 'p.value']
    is_old_term = df['term'].eq('old')
    is_intercept = df['term'].eq('(Intercept)')
    # per-feature intercept estimates, renamed so they do not clash on merge
    intercepts = (df.loc[is_intercept, ['feature', 'estimate']]
                  .rename(columns={'estimate': 'intercept'}))
    merged = df.loc[is_old_term].merge(intercepts, how='inner', on='feature')
    return merged[out_columns]

def read_glmmtmb_results(cell_name: str, group_type: str, cols_to_rename: dict) -> DataFrame:
    """Read and tidy the glmmTMB results file for one cell group.

    The file is located with `temp_r_out_file` under `results_dir`. Feature
    names are mapped back through the inverse of `cols_to_rename`, the table is
    reshaped by `reformat_glmmtmb_df`, and the group labels are attached.

    Args:
        cell_name: name of the cell group; stored in the `tissue` column.
        group_type: label stored in the `type` column.
        cols_to_rename: mapping of original feature name -> renamed name.

    Returns:
        One row per feature with the added `tissue` and `type` columns.
    """
    results_path = temp_r_out_file.format(this_dir=results_dir,
                                          name=cell_name.replace(" ", "_"))
    raw_results = read_csv(results_path)
    # features were renamed ('-' -> '_') for R, so invert the
    # key/values of the mapping to get back to the original names
    renamed_to_original = {renamed: original
                           for original, renamed in cols_to_rename.items()}
    raw_results['feature'] = raw_results['feature'].replace(renamed_to_original)
    tidy_results = reformat_glmmtmb_df(raw_results)
    tidy_results['tissue'] = cell_name
    tidy_results['type'] = group_type
    return tidy_results

def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p.value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Add multiple-testing adjusted p-values to a copy of `df`.

    Args:
        df: table holding the p-values.
        alpha: level passed to `multipletests` and used for the printed count.
        p_col: name of the column with the unadjusted p-values.
        method: correction method passed to `multipletests`; also the name of
            the column that receives the adjusted values.
        verbose: if True, print the shape of the rows below `alpha`.

    Returns:
        A copy of `df` with the extra `method` column.
    """
    adjusted_df = df.copy()
    p_values = np.array(adjusted_df[p_col])
    # the second element of the multipletests result is used as the adjusted p-values
    adjusted_df[method] = multipletests(p_values, alpha=alpha, method=method)[1]
    if verbose:
        significant_shape = adjusted_df.loc[adjusted_df[method] < alpha].shape
        print(f'total significant after correction: {significant_shape}')
    return adjusted_df

def subset_anndata(data: AnnData, cell_name: str, features: list, reapply_filter: bool=True, 
                   min_cell_count: int=3, verbose: bool=False) -> AnnData:
    """Copy the cells of one cell type, restricted to the given features.

    Args:
        data: object to subset; rows are selected where
            `data.obs.Cell_type == cell_name`.
        cell_name: value of `Cell_type` to keep.
        features: features (columns) to keep.
        reapply_filter: if True, run `sc.pp.filter_genes` and
            `sc.pp.filter_cells` on the subset.
        min_cell_count: passed as `min_counts` to both scanpy filters.
        verbose: if True, print the shapes before/after and the subset itself.

    Returns:
        The subset copy (filtered when `reapply_filter` is True).
    """
    this_data = data[(data.obs.Cell_type == cell_name), features].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # taken after the optional filtering so it is always defined; previously
    # verbose=True with reapply_filter=False failed on an unassigned name
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def convert_ad_to_df(data: AnnData, young_age_limit: float=30.0, 
                     verbose: bool=False) -> DataFrame:
    """Convert the data to a cells x features DataFrame with covariates.

    The expression table from `data.to_df()` is joined column-wise with the
    `Sample_ID`, `Age` and `Sex` annotations plus two derived indicators:
    `old` (1 when `Age` is greater than `young_age_limit`) and `female`
    (1 when `Sex` is 'female', compared case-insensitively because the
    replication data stores the labels in lowercase).

    Args:
        data: object providing `to_df()` and an `obs` table.
        young_age_limit: ages strictly above this value are flagged as old.
        verbose: if True, print the shape and display the head of the result.

    Returns:
        The combined DataFrame with its index named 'barcodekey', or None when
        the expression and annotation indexes are not equal.
    """
    data_df = data.to_df()
    annots = data.obs[['Sample_ID', 'Age','Sex']].copy()
    annots['old'] = np.where((annots['Age'] > young_age_limit), 1, 0)
    # case-insensitive match: an exact 'Female' comparison never matches the
    # lowercase labels and would leave the covariate at 0 for every cell
    is_female = annots['Sex'].astype(str).str.lower() == 'female'
    annots['female'] = np.where(is_female, 1, 0)
    this_df = None
    if data_df.index.equals(annots.index):
        this_df = concat([data_df, annots], axis='columns')
        this_df.index.name = 'barcodekey'
        if verbose:
            print(f'anndata to pandas df complete: {this_df.shape}')
            print(this_df.shape)
            display(this_df.head())
    return this_df

def feature_detected(feature_col, features: list=None, df: DataFrame=None, 
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False):
    """Decide whether one column counts as a well-detected feature.

    A sample detects the feature when more than `min_cell_count` of its cells
    have a value above zero. The feature passes when the fraction of samples
    (out of all unique `Sample_ID` values in `df`) that detect it is at least
    `min_sample_det_rate`.

    Args:
        feature_col: one column of `df` (a Series with a `name`).
        features: names that are subject to the check; any column whose name
            is not listed is reported as good without being checked.
        df: the full table; must contain a `Sample_ID` column.
        min_cell_count: a sample needs strictly more non-zero cells than this.
        min_sample_det_rate: minimum fraction of detecting samples.
        verbose: if True, print the per-feature counts and outcome.

    Returns:
        True when the column passes (or is not checked), otherwise False.
    """
    # columns outside the feature list, such as the covariates, always pass
    if feature_col.name not in features:
        return True
    expressed = feature_col[feature_col > 0]
    cells_per_sample = df.loc[expressed.index].Sample_ID.value_counts()
    n_detecting_samples = (cells_per_sample > min_cell_count).sum()
    n_samples = df.Sample_ID.nunique()
    is_detected = n_detecting_samples / n_samples >= min_sample_det_rate
    if verbose:
        print(feature_col.name, end=', ')
        print(f'nz_df.shape = {expressed.shape}', end=', ')
        print(f'{n_detecting_samples}/{n_samples}', end=', ')
        print(is_detected)
    return is_detected

def poorly_detected_features(features: list=None, df: DataFrame=None, 
                             verbose=False) -> list:
    """List the columns of `df` that fail the `feature_detected` check.

    Args:
        features: feature names that should be checked.
        df: cells x columns table, including the `Sample_ID` column.
        verbose: if True, print how many features were flagged.

    Returns:
        Names of the columns for which `feature_detected` returned False.
    """
    # one True/False entry per column of df
    detected_flags = df.apply(feature_detected, features=features, df=df)
    flagged = detected_flags[~detected_flags].index.to_list()
    if verbose:
        print(f'bad features counts is {len(flagged)}')
    return flagged

def save_df_for_glmmtmb_in_r(df: DataFrame, cell_name: str):
    """Write the glmmTMB input table and its column-rename map to disk.

    Column names containing a hyphen are rewritten with underscores before the
    table is saved, because R formulas do not accept hyphenated names. The
    mapping of original -> rewritten names is saved as json so the rename can
    be reverted later. Both files go under `replication_dir`, named from
    `temp_r_in_file` and `temp_name_remap_json`; spaces in `cell_name` are
    replaced with underscores.

    Args:
        df: table to save.
        cell_name: name of the cell group, used to build the file names.

    Returns:
        None.
    """
    file_name = cell_name.replace(" ", "_")
    # R doesn't like column names with hyphens in 
    # data frames when building formulas so replace temporarily
    # find features containing hyphen
    feats_w_hyphen = df.columns[df.columns.str.contains('-')]
    # make dictionary to do replace
    rename_cols = {x: x.replace('-', '_') for x in feats_w_hyphen}
    df = df.rename(columns=rename_cols)
    df.to_csv(temp_r_in_file.format(this_dir=f'{replication_dir}', 
                                    name=file_name))
    # save the gene rename dict; the context manager guarantees the file is
    # flushed and closed before anything else tries to read it
    remap_path = temp_name_remap_json.format(this_dir=f'{replication_dir}',
                                             name=file_name)
    with open(remap_path, 'w') as remap_file:
        json.dump(rename_cols, remap_file)

def diffexp_group(data: AnnData, cell_name: str,
                  min_cell_count: int=3, 
                  verbose: bool=False) -> str:
    """Prepare and save the glmmTMB input table for one cell group.

    The data is converted with `convert_ad_to_df`, columns reported by
    `poorly_detected_features` are dropped, and the remaining table is written
    with `save_df_for_glmmtmb_in_r`.

    Args:
        data: the (already subset) data for this cell group.
        cell_name: name of the cell group, used for the output file names.
        min_cell_count: accepted for compatibility; not used in this function.
        verbose: if True, print progress messages.

    Returns:
        Whatever `save_df_for_glmmtmb_in_r` returns; that function has no
        return statement, so this is None despite the `str` annotation.
    """
    if verbose:
        print('converting anndata to pandas df')    
    cells_df = convert_ad_to_df(data)
    # find features poorly detected and don't include in analysis
    if verbose:
        print(f'finding poorly detected features from cells x features {cells_df.shape}')    
    dropped = poorly_detected_features(data.var.index.values, cells_df)
    clean_df = cells_df.drop(columns=dropped)
    # features of the input that survived the detection filter
    keep_features = set(data.var.index).intersection(clean_df.columns)
    n_features = len(keep_features)
    n_cells = clean_df.shape[0]
    if verbose:
        print(f'formatting glmmTMB command for {n_features} features and {n_cells} cells')    
    save_result = save_df_for_glmmtmb_in_r(clean_df, cell_name)
    print(f'\ndone with {cell_name} kept {n_features} features and {n_cells} cells')
    return save_result

def diffexp_group_wrapper(data: AnnData, cell_name: str):
    """Run `diffexp_group` with its defaults and discard the result.

    Used as the `target` of the worker processes.
    """
    diffexp_group(data=data, cell_name=cell_name)

### load data

#### load replication data

In [5]:
%%time
# read the replication data from anndata_file (cache=True is passed to scanpy's reader)
adata = sc.read(anndata_file, cache=True)
print(adata)
if DEBUG:
    # preview five randomly sampled rows of the cell annotations
    display(adata.obs.sample(5))

AnnData object with n_obs × n_vars = 74291 × 21374
    obs: 'Sample_ID', 'Sex', 'Age', 'Barcode', 'Study', 'Study_type', 'Batch', 'Cluster', 'Cell_type', 'Brain_region', 'leiden_scVI'
    var: 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'Cell_type_colors', 'Cluster_colors', 'hvg', 'leiden', 'leiden_scVI_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'


Unnamed: 0,Sample_ID,Sex,Age,Barcode,Study,Study_type,Batch,Cluster,Cell_type,Brain_region,leiden_scVI
AGCTAGTTCTTTGACT-1,UMARY-4546,female,86.0,AGCTAGTTCTTTGACT-1,Reed,replication,,SFG:Exc.3,ExN,frontal cortex,2
CGTAATGGTTGCCTCA-1,UMARY-5088,male,66.0,CGTAATGGTTGCCTCA-1,Reed,replication,,SFG:Oligo.2,Oligo,frontal cortex,0
TGCGCGAGTGTAATAC-1,UMARY-1541,female,20.0,TGCGCGAGTGTAATAC-1,Reed,replication,,SFG:Micro,Micro,frontal cortex,7
TACGTACAGCGGATAA-1,UMARY-5088,male,66.0,TACGTACAGCGGATAA-1,Reed,replication,,SFG:Oligo.2,Oligo,frontal cortex,0
GAGGCTTGTACCAGGT-1,UMARY-1541,female,20.0,GAGGCTTGTACCAGGT-1,Reed,replication,,SFG:Oligo.2,Oligo,frontal cortex,0


CPU times: user 544 ms, sys: 3.28 s, total: 3.82 s
Wall time: 3.82 s


In [6]:
# number of cells per Sex label; note the labels are lowercase ('male'/'female')
adata.obs.Sex.value_counts()

male      46203
female    28088
Name: Sex, dtype: int64

In [7]:
# number of cells per Cell_type label; these labels are the values of cell_abbr_mappings
display(adata.obs.Cell_type.value_counts())

Oligo    29091
ExN      22157
InN       7663
Astro     6645
Micro     4098
OPC       3154
Endo      1478
Name: Cell_type, dtype: int64

In [16]:
# number of cells per Sample_ID
display(adata.obs.Sample_ID.value_counts())

UMARY-5088    11236
UMARY-5171     8568
UMARY-1540     8475
UMARY-1710     8157
UMARY-4789     7266
UMARY-1541     7130
UMARY-4546     5535
UMARY-1818     4823
UMARY-602      4694
UMARY-5123     4025
UMARY-4726     2936
UMARY-5028     1446
Name: Sample_ID, dtype: int64

#### load discovery results
Use these results to determine which features need to be analyzed in which cell types.

In [8]:
# collect one results frame per region/cell-type and concatenate once at the
# end instead of re-copying the growing table on every iteration
result_frames = []
for region in REGIONS:
    for cell_type in cell_abbr_mappings.keys():
        print(region, cell_type)
        in_results_file = f'{results_dir}/aging.{region}_{cell_type}_glmmtmb_results_temp.csv'
        # only some region/cell-type combinations have a results file
        if exists(in_results_file):
            this_tissue = f'{region}_{cell_type}'
            renamed_features = read_feature_renamed_map(this_tissue)
            result_frames.append(read_glmmtmb_results(this_tissue, 'region_broad_type',
                                                      renamed_features))
            print('Done.')
        else:
            print('Not found, skipping.')
# stays None when no results file was found at all, as before
glmmtmb_results = concat(result_frames) if result_frames else None

Middle_temporal_gyrus ExN
Not found, skipping.
Middle_temporal_gyrus Oligodendrocyte
Not found, skipping.
Middle_temporal_gyrus Astrocyte
Not found, skipping.
Middle_temporal_gyrus InN
Not found, skipping.
Middle_temporal_gyrus OPC
Not found, skipping.
Middle_temporal_gyrus Microglia
Done.
Middle_temporal_gyrus Endothelial
Done.
Putamen ExN
Not found, skipping.
Putamen Oligodendrocyte
Not found, skipping.
Putamen Astrocyte
Not found, skipping.
Putamen InN
Not found, skipping.
Putamen OPC
Not found, skipping.
Putamen Microglia
Not found, skipping.
Putamen Endothelial
Done.
Entorhinal_cortex ExN
Not found, skipping.
Entorhinal_cortex Oligodendrocyte
Not found, skipping.
Entorhinal_cortex Astrocyte
Not found, skipping.
Entorhinal_cortex InN
Not found, skipping.
Entorhinal_cortex OPC
Not found, skipping.
Entorhinal_cortex Microglia
Not found, skipping.
Entorhinal_cortex Endothelial
Done.
Subventricular_zone ExN
Done.
Subventricular_zone Oligodendrocyte
Not found, skipping.
Subventricular_z

### compute the FDR values

In [9]:
# features without a p-value are treated as non-significant (p = 1) before the correction
p_values_filled = glmmtmb_results['p.value'].fillna(1)
glmmtmb_results = compute_bh_fdr(glmmtmb_results.assign(**{'p.value': p_values_filled}))
print(f'results shape is {glmmtmb_results.shape}')
if DEBUG:
    # ten results with the smallest adjusted p-values
    top_results = glmmtmb_results.sort_values('fdr_bh').head(10)
    display(top_results)

total significant after correction: (14, 9)
results shape is (2023, 9)


Unnamed: 0,feature,intercept,estimate,std.error,statistic,p.value,tissue,type,fdr_bh
76,SYT1,1.615722,0.109562,0.021619,5.067887,4.022561e-07,Subventricular_zone_ExN,region_broad_type,0.000814
17,SLC4A10,1.397294,0.123548,0.02912,4.242747,2.208001e-05,Subventricular_zone_ExN,region_broad_type,0.018812
57,SPTBN1,1.533858,-0.333345,0.079908,-4.171609,3.024566e-05,Subventricular_zone_Endothelial,region_broad_type,0.018812
488,C22orf34,-0.939647,1.25274,0.30375,4.124244,3.719552e-05,Middle_temporal_gyrus_Microglia,region_broad_type,0.018812
412,XAF1,-1.128369,1.334865,0.32781,4.072065,4.659808e-05,Middle_temporal_gyrus_Microglia,region_broad_type,0.018854
255,TUSC3,0.330063,-0.882462,0.22033,-4.005178,6.197073e-05,Entorhinal_cortex_Endothelial,region_broad_type,0.020894
85,SNHG14,1.775962,0.062358,0.016047,3.885843,0.0001019753,Subventricular_zone_ExN,region_broad_type,0.022922
478,MORC3,-0.394559,0.886503,0.226021,3.922224,8.773553e-05,Middle_temporal_gyrus_Microglia,region_broad_type,0.022922
410,FBXO34,0.72123,-0.628637,0.160861,-3.907961,9.307833e-05,Entorhinal_cortex_Endothelial,region_broad_type,0.022922
75,NAV3,1.600776,0.098173,0.025709,3.818672,0.0001341718,Subventricular_zone_ExN,region_broad_type,0.027143


#### how many are 'nominally' significant

In [10]:
# results whose unadjusted p-value is below MAX_ALPHA
is_nominal = glmmtmb_results['p.value'] < MAX_ALPHA
nominal_df = glmmtmb_results.loc[is_nominal]
print(nominal_df.shape)
if DEBUG:
    # the ten weakest of the nominally significant results
    weakest_nominal = nominal_df.sort_values('p.value').tail(10)
    display(weakest_nominal)

(657, 9)


Unnamed: 0,feature,intercept,estimate,std.error,statistic,p.value,tissue,type,fdr_bh
95,ACSL3,0.343405,0.29341,0.149056,1.968457,0.049015,Entorhinal_cortex_Endothelial,region_broad_type,0.152978
208,HIST1H2AC,-0.694536,0.581462,0.29547,1.967923,0.049077,Middle_temporal_gyrus_Microglia,region_broad_type,0.152978
443,SAFB2,-0.241533,0.441522,0.22445,1.96713,0.049168,Middle_temporal_gyrus_Microglia,region_broad_type,0.153027
312,PAPSS2,0.095513,0.484774,0.246684,1.965159,0.049396,Entorhinal_cortex_Endothelial,region_broad_type,0.153228
19,RPL5,-0.580144,0.507079,0.258058,1.964984,0.049416,Middle_temporal_gyrus_Microglia,region_broad_type,0.153228
460,ZNF207,0.515025,-0.316048,0.160871,-1.964604,0.04946,Entorhinal_cortex_Endothelial,region_broad_type,0.153228
42,ADAMTS9-AS2,1.259122,-0.408146,0.207838,-1.963773,0.049556,Putamen_Endothelial,region_broad_type,0.153292
438,WDR7,0.781331,0.311031,0.158475,1.962657,0.049686,Middle_temporal_gyrus_Microglia,region_broad_type,0.153458
80,KANSL1L,0.465067,-0.594864,0.303211,-1.961877,0.049777,Subventricular_zone_Endothelial,region_broad_type,0.153504
28,RAB28,0.6044,0.375862,0.19165,1.961195,0.049856,Subventricular_zone_ExN,region_broad_type,0.153515


#### count of significant genes by brain region cell type

In [11]:
# count the FDR-significant features per tissue, using the shared MAX_ALPHA
# threshold rather than a repeated literal
glmmtmb_results.loc[glmmtmb_results['fdr_bh'] < MAX_ALPHA].tissue.value_counts()

Middle_temporal_gyrus_Microglia    5
Entorhinal_cortex_Endothelial      4
Subventricular_zone_ExN            4
Subventricular_zone_Endothelial    1
Name: tissue, dtype: int64

In [12]:
# number of nominally significant features per tissue
nominal_df.tissue.value_counts()

Middle_temporal_gyrus_Microglia      201
Entorhinal_cortex_Endothelial        181
Subventricular_zone_Endothelial      130
Middle_temporal_gyrus_Endothelial     58
Putamen_Endothelial                   44
Subventricular_zone_ExN               43
Name: tissue, dtype: int64

### format the data for input to glmmTMB
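Based on the nominally significant results from the discovery glmmTMB analysis. The cell below is currently disabled (commented out); the cell after it is the one that runs.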

In [13]:
# for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
#     cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
#     cell_name = f'Frontal_cortex_{rep_cell_type}'
#     # subset the adata by cell-type and features
#     features = nominal_df.loc[nominal_df.tissue.isin(cell_names)].feature.unique()
#     if len(features) > 0:
#         print(f'--- processing {cell_name} in parallel')
#         features = list(set(features) & set(adata.var.index))
#         adata_sub = subset_anndata(adata, rep_cell_type, features)
#         diffexp_group(adata_sub, cell_name)

based on all testable input for the discovery glmmTMB analysis

In [14]:
%%time

cmds = {}
for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
    cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
    features = {}
    for cell_name in cell_names:
        in_file = temp_r_in_file.format(this_dir=quants_dir, name=cell_name)
        glmm_in_df = pl_read_csv(in_file)
        features = set(features) | set(glmm_in_df.columns)
    # need to deal with hyphen to underscore revert
    features = [st.replace('_', '-') for st in features]
    features = list(set(features) & set(adata.var.index))
    print(f'{rep_cell_type} might test {len(features)} features')
    adata_sub = subset_anndata(adata, rep_cell_type, features)
    cell_name = f'Frontal_cortex_{rep_cell_type}'
    p = Process(target=diffexp_group_wrapper,args=(adata_sub, cell_name))
    p.start()
    # Append process and key to keep track
    cmds[rep_cell_type] = p    
    # diffexp_group(adata_sub, cell_name)
# Wait for all processes to finish
for key, p in cmds.items():
    p.join()    

ExN might test 13748 features
Oligo might test 11275 features
Astro might test 8403 features
InN might test 13758 features
OPC might test 5477 features
Micro might test 4702 features
Endo might test 1130 features

done with Frontal_cortex_Endo kept 1123 features and 1478 cells

done with Frontal_cortex_Micro kept 4295 features and 4098 cells

done with Frontal_cortex_OPC kept 5396 features and 3154 cells

done with Frontal_cortex_Astro kept 8072 features and 6645 cells

done with Frontal_cortex_InN kept 13399 features and 7663 cells

done with Frontal_cortex_Oligo kept 10909 features and 29091 cells

done with Frontal_cortex_ExN kept 13570 features and 22157 cells
CPU times: user 4min 4s, sys: 56.6 s, total: 5min 1s
Wall time: 52min 17s


In [15]:
# print the current date/time via the shell at the end of the notebook run
!date

Thu Dec 14 15:17:07 UTC 2023
