## Notebook to prepare the input files from the replication data for analysis with glmmTMB

In [1]:
# shell escape: print the wall-clock time at the start of the run
!date

Mon Jan  8 18:21:55 EST 2024


#### import libraries

In [2]:
import scanpy as sc
from os.path import exists
from pandas import DataFrame, concat, read_csv
from anndata import AnnData
import numpy as np
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
import json
from statsmodels.stats.multitest import multipletests
from polars import read_csv as pl_read_csv
from multiprocessing import Process

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [3]:
# naming
project = 'aging_phase1'
set_name = f'{project}_replication'

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
# outputs of this notebook (glmmTMB input tables and name remaps) are written here
replication_dir = f'{wrk_dir}/replication'
# the discovery glmmTMB input tables and name remaps are read from here
quants_dir = f'{wrk_dir}/demux'
# the discovery glmmTMB result tables are read from here
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'
sc.settings.figdir = f'{figures_dir}/'

# in files
anndata_file = f'{replication_dir}/{set_name}.scvi.h5ad'
# NOTE: this template ends in '.csv' but the file is read and written with the json module;
# it is read from quants_dir and also written to replication_dir by this notebook
temp_name_remap_json = '{this_dir}/{name}_gene_name_remap_temp.csv'
temp_r_out_file = '{this_dir}/aging.{name}_glmmtmb_results_temp.csv'

# out files
# the same template is also used to locate the discovery inputs under quants_dir
temp_r_in_file = '{this_dir}/{name}_glmmtmb_in_df_temp.csv'


# variables
# when True, cells display extra preview tables
DEBUG = True
# discovery brain regions; combined with cell types to build '<region>_<cell type>' names
REGIONS = ['Middle_temporal_gyrus', 'Putamen', 
           'Entorhinal_cortex', 'Subventricular_zone']
# NOTE(review): CELLTYPES is not referenced by any cell visible in this notebook
CELLTYPES=['ExN', 'Oligodendrocyte', 'Astrocyte', 'InN', 'OPC', 'Mural', 
           'Microglia', 'SPN', 'Endothelial', 'Ependymal']
# discovery cell-type name -> Cell_type label used in the replication data
cell_abbr_mappings = {'ExN': 'ExN', 'Oligodendrocyte': 'Oligo', 'Astrocyte': 'Astro', 
                      'InN': 'InN', 'OPC': 'OPC', 'Microglia': 'Micro', 'Endothelial': 'Endo'}
# p-value threshold used to select nominally significant discovery results
MAX_ALPHA = 0.05
# name of the AnnData layer exported as the expression matrix
SCVI_NORMALIZED_KEY = 'scvi_normalized'

#### functions

In [4]:
def read_feature_renamed_map(cell_name: str) -> dict:
    """Load the saved feature-name remap for one cell grouping.

    The remap file is looked up under `quants_dir` using the
    `temp_name_remap_json` template; spaces in `cell_name` are replaced
    with underscores to build the file name.

    Args:
        cell_name: name of the cell grouping, e.g. '<region>_<cell type>'.

    Returns:
        The dictionary parsed from the JSON file.

    Raises:
        FileNotFoundError: if the remap file does not exist.
    """
    remap_path = temp_name_remap_json.format(this_dir=quants_dir,
                                             name=cell_name.replace(" ", "_"))
    # context manager so the file handle is always closed
    with open(remap_path) as in_file:
        rename_cols = json.load(in_file)
    return rename_cols

def reformat_glmmtmb_df(df: DataFrame) -> DataFrame:
    """Collapse long-format glmmTMB terms into one row per feature.

    Rows whose `term` is 'old' supply the effect columns; the `estimate`
    of each feature's '(Intercept)' row is attached as `intercept`.
    Features lacking either term are dropped by the inner join.

    Args:
        df: results with at least the columns feature, term, estimate,
            std.error, statistic and p.value.

    Returns:
        Frame with columns feature, intercept, estimate, std.error,
        statistic, p.value.
    """
    output_columns = ['feature', 'intercept', 'estimate',
                      'std.error', 'statistic', 'p.value']
    is_age_term = df['term'] == 'old'
    is_intercept = df['term'] == '(Intercept)'
    # intercept estimates, renamed so they do not clash with the age-term estimate
    intercepts = (df.loc[is_intercept, ['feature', 'estimate']]
                  .copy()
                  .rename(columns={'estimate': 'intercept'}))
    age_effects = df.loc[is_age_term].copy()
    merged = age_effects.merge(intercepts, how='inner', on='feature')
    return merged[output_columns]

def read_glmmtmb_results(cell_name: str, group_type: str, cols_to_rename: dict) -> DataFrame:
    """Read one glmmTMB results file and return it as one row per feature.

    Args:
        cell_name: cell grouping name; spaces become underscores in the file name.
        group_type: label stored in the `type` column of the output.
        cols_to_rename: mapping of original feature name -> R-safe name.

    Returns:
        Reformatted results with `tissue` and `type` columns added.
    """
    results_path = temp_r_out_file.format(this_dir=f'{results_dir}',
                                          name=cell_name.replace(" ", "_"))
    raw_results = read_csv(results_path)
    # features with '-' were written as '_' for R; invert the key/value
    # pairs of the remap to restore the original names
    original_names = dict((r_safe, original)
                          for original, r_safe in cols_to_rename.items())
    raw_results['feature'] = raw_results['feature'].replace(original_names)
    tidy_results = reformat_glmmtmb_df(raw_results)
    tidy_results['tissue'] = cell_name
    tidy_results['type'] = group_type
    return tidy_results

def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p.value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Add multiple-testing adjusted p-values to a copy of `df`.

    Args:
        df: frame holding the p-values to adjust.
        alpha: level passed to `multipletests` and used for the reported count.
        p_col: name of the column holding the p-values.
        method: correction method passed to `multipletests`; also the name
            of the column that receives the adjusted values.
        verbose: when True, print the shape of the rows below `alpha`.

    Returns:
        A copy of `df` with the adjusted p-values in a column named `method`.
    """
    corrected = df.copy()
    p_values = np.array(corrected[p_col])
    # element 1 of the multipletests result holds the adjusted p-values
    adjusted_p = multipletests(p_values, alpha=alpha, method=method)[1]
    corrected[method] = adjusted_p
    if verbose:
        significant = corrected.loc[corrected[method] < alpha]
        print(f'total significant after correction: {significant.shape}')
    return corrected

def subset_anndata(data: AnnData, cell_name: str, features: list, reapply_filter: bool=True, 
                   min_cell_count: int=3, verbose: bool=False) -> AnnData:
    """Copy out the cells of one cell type restricted to the given features.

    Args:
        data: the full AnnData object; `data.obs.Cell_type` selects the cells.
        cell_name: `Cell_type` value to keep.
        features: variable names to keep.
        reapply_filter: when True, re-run the scanpy gene and cell filters
            on the subset.
        min_cell_count: threshold handed to both scanpy filters.
        verbose: when True, print the shapes before/after and the subset.

    Returns:
        A new AnnData holding the (optionally filtered) subset.
    """
    this_data = data[(data.obs.Cell_type == cell_name), features].copy()
    shape_before = this_data.shape
    if reapply_filter:
        # NOTE(review): the threshold is passed as `min_counts` to both filters
        # although the parameter is named `min_cell_count` - confirm this is intended
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # read the shape outside the branch so the verbose report also works
    # when no filtering was requested
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data

def convert_ad_to_df(data: AnnData, young_age_limit: float=30.0, 
                     verbose: bool=False) -> DataFrame:
    """Export the normalized expression layer plus sample covariates as one frame.

    Two binary covariates are derived from `data.obs`:
    `old` (1 when Age > `young_age_limit`) and `female` (1 when Sex is
    'female', matched case-insensitively so both 'Female' and 'female'
    labels are recognised).

    Args:
        data: AnnData with the `SCVI_NORMALIZED_KEY` layer and the obs
            columns Sample_ID, Age and Sex.
        young_age_limit: ages strictly above this value are flagged as old.
        verbose: when True, print the shape and display the head of the result.

    Returns:
        Cells x (features + Sample_ID, Age, Sex, old, female) frame indexed
        by 'barcodekey', or None when the expression and annotation indices
        do not match.
    """
    data_df = data.to_df(SCVI_NORMALIZED_KEY)
    annots = data.obs[['Sample_ID', 'Age', 'Sex']].copy()
    annots['old'] = np.where(annots['Age'] > young_age_limit, 1, 0)
    # normalise case before comparing; an exact match against 'Female' would
    # flag nobody when the labels are stored in lowercase
    sex_labels = annots['Sex'].astype(str).str.lower()
    annots['female'] = np.where(sex_labels == 'female', 1, 0)
    if not data_df.index.equals(annots.index):
        # rows cannot be paired safely; signal that to the caller
        return None
    this_df = concat([data_df, annots], axis='columns')
    this_df.index.name = 'barcodekey'
    if verbose:
        print(f'anndata to pandas df complete: {this_df.shape}')
        display(this_df.head())
    return this_df

def feature_detected(feature_col, features: list=None, df: DataFrame=None, 
                     min_cell_count: int=3, min_sample_det_rate: float=0.5,
                     verbose: bool=False):
    """Decide whether one column is detected in enough samples.

    A sample counts as detecting the feature when more than
    `min_cell_count` of its cells have a value above zero. The feature
    passes when the fraction of such samples, out of all samples in `df`,
    is at least `min_sample_det_rate`.

    Args:
        feature_col: a column of `df` (a Series with a `name`).
        features: names to evaluate; any other column passes untested.
        df: the frame the column came from; must contain `Sample_ID`.
        min_cell_count: a sample needs strictly more expressing cells than this.
        min_sample_det_rate: minimum fraction of samples that must detect it.
        verbose: when True, print the per-feature tallies.

    Returns:
        True when the column passes (or is not a listed feature), else False.
    """
    # columns that are not listed features always pass
    if feature_col.name not in features:
        return True
    expressing_cells = feature_col[feature_col > 0]
    cells_per_sample = df.loc[expressing_cells.index].Sample_ID.value_counts()
    n_samples_detected = (cells_per_sample > min_cell_count).sum()
    n_samples_total = df.Sample_ID.nunique()
    is_detected = n_samples_detected / n_samples_total >= min_sample_det_rate
    if verbose:
        print(feature_col.name, end=', ')
        print(f'nz_df.shape = {expressing_cells.shape}', end=', ')
        print(f'{n_samples_detected}/{n_samples_total}', end=', ')
        print(is_detected)
    return is_detected

def poorly_detected_features(features: list=None, df: DataFrame=None, 
                             verbose=False) -> list:
    """List the columns of `df` that fail the `feature_detected` check.

    Args:
        features: names to evaluate; other columns are never reported.
        df: cells x columns frame that includes `Sample_ID`.
        verbose: when True, print how many columns failed.

    Returns:
        Names of the columns for which `feature_detected` was False.
    """
    # one pass/fail flag per column, indexed by column name
    detected_mask = df.apply(feature_detected, features=features, df=df)
    failed = detected_mask.loc[~detected_mask]
    bad_features = failed.index.to_list()
    if verbose:
        print(f'bad features counts is {len(bad_features)}')
    return bad_features

def save_df_for_glmmtmb_in_r(df: DataFrame, cell_name: str):
    """Write the glmmTMB input table and its column-name remap to disk.

    Column names containing '-' are rewritten with '_' before saving
    because R formulas do not accept hyphens. The original -> renamed
    mapping is saved as JSON so the names can be restored later. Both
    files go under `replication_dir`; spaces in `cell_name` become
    underscores in the file names.

    Args:
        df: table to save.
        cell_name: cell grouping name used to build the file names.
    """
    safe_name = cell_name.replace(" ", "_")
    # find the columns containing a hyphen and build the rename mapping
    feats_w_hyphen = df.columns[df.columns.str.contains('-')]
    rename_cols = {x: x.replace('-', '_') for x in feats_w_hyphen}
    df = df.rename(columns=rename_cols)
    df.to_csv(temp_r_in_file.format(this_dir=f'{replication_dir}',
                                    name=safe_name))
    # save the gene rename dict; the context manager guarantees the JSON is
    # flushed and the handle closed before anything tries to read it back
    remap_path = temp_name_remap_json.format(this_dir=f'{replication_dir}',
                                             name=safe_name)
    with open(remap_path, 'w') as out_file:
        json.dump(rename_cols, out_file)

def diffexp_group(data: AnnData, cell_name: str,
                  min_cell_count: int=3, 
                  verbose: bool=False) -> str:
    """Prepare and save the glmmTMB input table for one cell grouping.

    Converts the AnnData to a data frame, drops poorly detected features
    and writes the remaining table with `save_df_for_glmmtmb_in_r`.

    Args:
        data: AnnData already restricted to the cells and features of interest.
        cell_name: name used for the output files and progress messages.
        min_cell_count: accepted for compatibility; not used by this function.
        verbose: when True, print progress messages.

    Returns:
        Whatever `save_df_for_glmmtmb_in_r` returns. That helper has no
        return statement in this notebook, so the value is None despite
        the `str` annotation.

    Raises:
        ValueError: if `convert_ad_to_df` could not build the data frame.
    """
    if verbose:
        print('converting anndata to pandas df')
    type_df = convert_ad_to_df(data)
    if type_df is None:
        # fail with a clear message instead of an AttributeError further down
        raise ValueError(f'could not convert anndata to a data frame for {cell_name}')
    # find features poorly detected and don't include in analysis
    if verbose:
        print(f'finding poorly detected features from cells x features {type_df.shape}')
    bad_features = poorly_detected_features(data.var.index.values, type_df)
    type_clean_df = type_df.drop(columns=bad_features)
    # count only real features, not the appended covariate columns
    keep_features = set(data.var.index) & set(type_clean_df.columns)
    if verbose:
        print(f'formatting glmmTMB command for {len(keep_features)} features and {type_clean_df.shape[0]} cells')
    this_cmd = save_df_for_glmmtmb_in_r(type_clean_df, cell_name)
    print(f'\ndone with {cell_name} kept {len(keep_features)} features and {type_clean_df.shape[0]} cells')
    return this_cmd

def diffexp_group_wrapper(data: AnnData, cell_name: str):
    """Run `diffexp_group` with default options and discard its result."""
    diffexp_group(data=data, cell_name=cell_name)

### load data

#### load replication data

In [5]:
%%time
# read the replication AnnData from the path set in the notebook variables
adata = sc.read(anndata_file, cache=True)
print(adata)
if DEBUG:
    # preview a random handful of per-cell annotations
    display(adata.obs.sample(5))

AnnData object with n_obs × n_vars = 74291 × 21994
    obs: 'Sample_ID', 'Sex', 'Age', 'Barcode', 'Study', 'Study_type', 'Batch', 'Cluster', 'Cell_type', 'Brain_region', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', '_scvi_batch', '_scvi_labels', 'leiden_scVI'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Brain_region_colors', 'Cell_type_colors', 'Cluster_colors', 'Study_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'leiden', 'leiden_scVI_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    layers: 'counts', 'scvi_normalized'
    obsp: 'connectivities', 'distances'


Unnamed: 0,Sample_ID,Sex,Age,Barcode,Study,Study_type,Batch,Cluster,Cell_type,Brain_region,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,n_counts,_scvi_batch,_scvi_labels,leiden_scVI
TAGGGTTTCAGCATTA-1,UMARY-1540,male,28.0,TAGGGTTTCAGCATTA-1,Reed,replication,,SFG:Oligo.2,Oligo,frontal cortex,1299,2761.0,4.0,0.144875,2761.0,3,0,0
ACCTTGTGTACCGTAA-1,UMARY-1540,male,28.0,ACCTTGTGTACCGTAA-1,Reed,replication,,SFG:Inh.4,InN,frontal cortex,3419,8816.0,11.0,0.124773,8816.0,3,0,16
GCCCTCATCCCTCAGT-1-1,UMARY-5123,male,61.0,GCCCTCATCCCTCAGT-1-1,Reed,replication,,SFG:Exc.2,ExN,frontal cortex,2654,5405.0,416.0,7.696577,5405.0,3,0,9
AGCTATATCCCGAACA-1,UMARY-1818,male,76.0,AGCTATATCCCGAACA-1,Reed,replication,,SFG:Oligo.2,Oligo,frontal cortex,1857,3906.0,32.0,0.819252,3906.0,3,0,0
AAGGATGTCGCGCTAA-1,UMARY-4789,female,72.0,AAGGATGTCGCGCTAA-1,Reed,replication,,SFG:Micro,Micro,frontal cortex,1478,2389.0,7.0,0.29301,2389.0,3,0,10


CPU times: user 668 ms, sys: 3.02 s, total: 3.69 s
Wall time: 3.69 s


In [6]:
# number of cells per Sex label
adata.obs.Sex.value_counts()

Sex
male      46203
female    28088
Name: count, dtype: int64

In [7]:
# number of cells per Cell_type label
display(adata.obs.Cell_type.value_counts())

Cell_type
Oligo    29133
ExN      22136
InN       7658
Astro     6720
Micro     3969
OPC       3172
Endo      1503
Name: count, dtype: int64

In [8]:
# number of cells per Sample_ID
display(adata.obs.Sample_ID.value_counts())

Sample_ID
UMARY-5088    11236
UMARY-5171     8568
UMARY-1540     8475
UMARY-1710     8157
UMARY-4789     7266
UMARY-1541     7130
UMARY-4546     5535
UMARY-1818     4823
UMARY-602      4694
UMARY-5123     4025
UMARY-4726     2936
UMARY-5028     1446
Name: count, dtype: int64

#### load discovery results
Use these results to determine which features need to be analyzed in which cell types.

In [9]:
# collect the discovery glmmTMB results for every region x cell-type pair
result_frames = []
for region in REGIONS:
    for cell_type in cell_abbr_mappings:
        print(region, cell_type)
        this_tissue = f'{region}_{cell_type}'
        # build the path from the same template read_glmmtmb_results uses
        in_results_file = temp_r_out_file.format(this_dir=results_dir, name=this_tissue)
        if exists(in_results_file):
            renamed_features = read_feature_renamed_map(this_tissue)
            result_frames.append(read_glmmtmb_results(this_tissue, 'region_broad_type',
                                                      renamed_features))
            print('Done.')
        else:
            print('Not found, skipping.')
# concatenate once instead of re-copying the growing frame on every iteration;
# stays None when no results file was found
glmmtmb_results = concat(result_frames) if result_frames else None

Middle_temporal_gyrus ExN
Done.
Middle_temporal_gyrus Oligodendrocyte
Done.
Middle_temporal_gyrus Astrocyte
Done.
Middle_temporal_gyrus InN
Done.
Middle_temporal_gyrus OPC
Done.
Middle_temporal_gyrus Microglia
Done.
Middle_temporal_gyrus Endothelial
Done.
Putamen ExN
Not found, skipping.
Putamen Oligodendrocyte
Done.
Putamen Astrocyte
Done.
Putamen InN
Done.
Putamen OPC
Done.
Putamen Microglia
Done.
Putamen Endothelial
Done.
Entorhinal_cortex ExN
Done.
Entorhinal_cortex Oligodendrocyte
Done.
Entorhinal_cortex Astrocyte
Done.
Entorhinal_cortex InN
Done.
Entorhinal_cortex OPC
Done.
Entorhinal_cortex Microglia
Done.
Entorhinal_cortex Endothelial
Done.
Subventricular_zone ExN
Done.
Subventricular_zone Oligodendrocyte
Done.
Subventricular_zone Astrocyte
Done.
Subventricular_zone InN
Done.
Subventricular_zone OPC
Done.
Subventricular_zone Microglia
Done.
Subventricular_zone Endothelial
Done.


### compute the FDR values

In [10]:
# missing p-values are set to 1, so those rows stay in the table
# and are included in the multiple-testing correction
filled_p_values = glmmtmb_results['p.value'].fillna(1)
glmmtmb_results = compute_bh_fdr(glmmtmb_results.assign(**{'p.value': filled_p_values}))
print(f'results shape is {glmmtmb_results.shape}')
if DEBUG:
    # strongest results first
    top_by_fdr = glmmtmb_results.sort_values('fdr_bh').head(10)
    display(top_by_fdr)

total significant after correction: (9955, 9)
results shape is (120457, 9)


Unnamed: 0,feature,intercept,estimate,std.error,statistic,p.value,tissue,type,fdr_bh
11666,RHBDL3,-1.154267,1.428514,0.107231,13.32179,1.729114e-40,Middle_temporal_gyrus_ExN,region_broad_type,2.082839e-35
3496,AC107208.1,-2.898783,1.065512,0.099695,10.687679,1.1624309999999999e-26,Entorhinal_cortex_ExN,region_broad_type,7.001145e-22
664,CLK1,-0.025009,0.492306,0.054011,9.11489,7.875025e-20,Middle_temporal_gyrus_Oligodendrocyte,region_broad_type,3.162006e-15
4395,LINC01285,-2.111554,1.178545,0.131453,8.965509,3.088503e-19,Middle_temporal_gyrus_Oligodendrocyte,region_broad_type,9.300796e-15
2421,ADM,-1.851778,1.353658,0.154116,8.783391,1.586179e-18,Subventricular_zone_Astrocyte,region_broad_type,3.821328e-14
990,AF279873.3,-1.060126,1.767797,0.201779,8.761072,1.93402e-18,Middle_temporal_gyrus_Astrocyte,region_broad_type,3.882772e-14
4347,HEPH,-1.515332,0.894057,0.1033,8.654944,4.9315360000000004e-18,Middle_temporal_gyrus_Oligodendrocyte,region_broad_type,8.486257e-14
2890,AC090015.1,0.577153,0.380033,0.044414,8.55655,1.162958e-17,Middle_temporal_gyrus_Oligodendrocyte,region_broad_type,1.556516e-13
8546,EXPH5,-0.080406,-0.931933,0.108822,-8.563825,1.091835e-17,Middle_temporal_gyrus_ExN,region_broad_type,1.556516e-13
2715,ADAMTS12,-1.549011,-0.953554,0.11306,-8.434065,3.338625e-17,Middle_temporal_gyrus_InN,region_broad_type,3.656007e-13


#### how many are 'nominally' significant

In [11]:
# keep the discovery results whose unadjusted p-value is below the threshold
is_nominal = glmmtmb_results['p.value'] < MAX_ALPHA
nominal_df = glmmtmb_results.loc[is_nominal]
print(nominal_df.shape)
if DEBUG:
    # see bottom of nominally significant
    weakest_nominal = nominal_df.sort_values('p.value').tail(10)
    display(weakest_nominal)

(37446, 9)


Unnamed: 0,feature,intercept,estimate,std.error,statistic,p.value,tissue,type,fdr_bh
828,PTENP1-AS,-1.81721,0.653339,0.333314,1.960129,0.049981,Middle_temporal_gyrus_OPC,region_broad_type,0.160808
911,AC114956.3,-1.22531,0.401744,0.204958,1.960126,0.049981,Putamen_Astrocyte,region_broad_type,0.160808
1953,VDAC2,-0.161835,-0.237274,0.121052,-1.960101,0.049984,Putamen_InN,region_broad_type,0.160808
33,KCNAB2,-2.208051,-0.163331,0.083328,-1.960099,0.049984,Subventricular_zone_Oligodendrocyte,region_broad_type,0.160808
782,KALRN,0.694318,-0.10506,0.053599,-1.960097,0.049984,Entorhinal_cortex_OPC,region_broad_type,0.160808
3288,HMGN4,-0.965503,-0.252273,0.128705,-1.960094,0.049985,Middle_temporal_gyrus_InN,region_broad_type,0.160808
7274,AC138305.1,-3.730984,0.277322,0.141485,1.960077,0.049987,Subventricular_zone_Oligodendrocyte,region_broad_type,0.160808
280,SUCO,-0.178578,0.124541,0.063539,1.960068,0.049988,Middle_temporal_gyrus_Oligodendrocyte,region_broad_type,0.160808
134,ALPL,-1.368566,-0.595521,0.303827,-1.960062,0.049989,Middle_temporal_gyrus_ExN,region_broad_type,0.160808
13828,RWDD2B,-1.918033,-0.347216,0.177154,-1.959968,0.05,Entorhinal_cortex_InN,region_broad_type,0.160839


#### count of significant genes by brain region cell type

In [12]:
# FDR-significant features per region/cell type; use the shared threshold
# constant rather than a repeated literal
glmmtmb_results.loc[glmmtmb_results['fdr_bh'] < MAX_ALPHA].tissue.value_counts()

tissue
Entorhinal_cortex_InN                    1269
Middle_temporal_gyrus_Oligodendrocyte    1090
Middle_temporal_gyrus_InN                1043
Entorhinal_cortex_ExN                     923
Entorhinal_cortex_Astrocyte               714
Middle_temporal_gyrus_ExN                 697
Entorhinal_cortex_Oligodendrocyte         601
Subventricular_zone_Oligodendrocyte       486
Subventricular_zone_Astrocyte             397
Entorhinal_cortex_OPC                     395
Subventricular_zone_Microglia             339
Putamen_Astrocyte                         337
Putamen_OPC                               277
Subventricular_zone_InN                   207
Subventricular_zone_OPC                   207
Middle_temporal_gyrus_Astrocyte           195
Putamen_Oligodendrocyte                   185
Entorhinal_cortex_Microglia               142
Putamen_Microglia                         134
Middle_temporal_gyrus_OPC                 131
Putamen_InN                               115
Middle_temporal_gyrus_Micro

In [13]:
# nominally significant features per region/cell type
nominal_df.tissue.value_counts()

tissue
Entorhinal_cortex_InN                    5261
Middle_temporal_gyrus_InN                3672
Entorhinal_cortex_ExN                    3155
Middle_temporal_gyrus_Oligodendrocyte    2392
Entorhinal_cortex_Astrocyte              2348
Middle_temporal_gyrus_ExN                2265
Entorhinal_cortex_Oligodendrocyte        2231
Subventricular_zone_Oligodendrocyte      1908
Subventricular_zone_Astrocyte            1691
Entorhinal_cortex_OPC                    1667
Subventricular_zone_Microglia            1328
Subventricular_zone_InN                  1212
Putamen_Astrocyte                        1097
Subventricular_zone_OPC                  1059
Putamen_OPC                               909
Middle_temporal_gyrus_Astrocyte           858
Putamen_Oligodendrocyte                   851
Putamen_Microglia                         835
Entorhinal_cortex_Microglia               740
Middle_temporal_gyrus_OPC                 688
Putamen_InN                               622
Middle_temporal_gyrus_Micro

### format the data for input to glmmTMB
Based on the nominally significant results from the discovery glmmTMB analysis (this variant is commented out below).

In [14]:
# for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
#     cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
#     cell_name = f'Frontal_cortex_{rep_cell_type}'
#     # subset the adata by cell-type and features
#     features = nominal_df.loc[nominal_df.tissue.isin(cell_names)].feature.unique()
#     if len(features) > 0:
#         print(f'--- processing {cell_name} in parallel')
#         features = list(set(features) & set(adata.var.index))
#         adata_sub = subset_anndata(adata, rep_cell_type, features)
#         diffexp_group(adata_sub, cell_name)

Based on all features that were testable inputs to the discovery glmmTMB analysis.

In [15]:
%%time

cmds = {}
for disc_cell_type, rep_cell_type in cell_abbr_mappings.items():
    cell_names = [f'{region}_{disc_cell_type}' for region in REGIONS]
    features = {}
    for cell_name in cell_names:
        in_file = temp_r_in_file.format(this_dir=quants_dir, name=cell_name)
        glmm_in_df = pl_read_csv(in_file)
        features = set(features) | set(glmm_in_df.columns)
    # need to deal with hyphen to underscore revert
    features = [st.replace('_', '-') for st in features]
    features = list(set(features) & set(adata.var.index))
    print(f'{rep_cell_type} might test {len(features)} features')
    adata_sub = subset_anndata(adata, rep_cell_type, features)
    cell_name = f'Frontal_cortex_{rep_cell_type}'
    p = Process(target=diffexp_group_wrapper,args=(adata_sub, cell_name))
    p.start()
    # Append process and key to keep track
    cmds[rep_cell_type] = p    
    # diffexp_group(adata_sub, cell_name)
# Wait for all processes to finish
for key, p in cmds.items():
    p.join()    

ExN might test 13748 features
Oligo might test 11275 features
Astro might test 8403 features
InN might test 13759 features
OPC might test 5477 features
Micro might test 4702 features
Endo might test 1130 features

done with Frontal_cortex_Endo kept 1130 features and 1503 cells

done with Frontal_cortex_Micro kept 4691 features and 3969 cells

done with Frontal_cortex_OPC kept 5476 features and 3172 cells

done with Frontal_cortex_Astro kept 8398 features and 6720 cells

done with Frontal_cortex_InN kept 13755 features and 7658 cells

done with Frontal_cortex_Oligo kept 11273 features and 29133 cells

done with Frontal_cortex_ExN kept 13743 features and 22136 cells
CPU times: user 1min 45s, sys: 2min 17s, total: 4min 3s
Wall time: 1h 29min 49s


In [16]:
# shell escape: print the wall-clock time at the end of the run
!date

Mon Jan  8 19:51:58 EST 2024
