# Check similarity between age effects in post-mortem snRNA data
Compare LNG phase1 age effects with CARD FCTX (Reed), Chien et al FCTX, and Jeffries et al FCTX

In [None]:
# print the shell's current date and time so the run is timestamped in the cell output
!date

#### import libraries

In [None]:
from pandas import read_csv, read_parquet, concat, DataFrame
from dask.dataframe import read_csv as dask_read_csv
from numpy import where

import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

from seaborn import heatmap

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### notebook variables

In [None]:
# ---- constants ----
# print extra diagnostics (paths, frame previews) when True
DEBUG = True
# threshold applied to each study's adjusted p-value when selecting features
ALPHA = 0.05
# absolute Spearman correlation above which a heatmap cell is annotated
SPEARMAN_CUTOFF = 0.4

# ---- cell-type name lists and display labels ----
# NOTE(review): these four names are not referenced by the other cells visible in
# this notebook; kept unchanged in case they are used elsewhere.
pm_names = [
    'cortical_astrocytes',
    'cortical_endothelial',
    'cortical_excitatory',
    'cortical_inhibitory',
    'cortical_microglia',
    'cortical_oligodendrocytes',
    'cortical_opc',
]
meta_all_names = ['excitatory', 'inhibitory', 'microglia']
differentiad_abbrvs = ['FB', 'MG']
cell_type_renames = {
    'cortical_astrocytes': 'Post-Mortem Cortical Astrocytes',
    'cortical_endothelial': 'Post-Mortem Cortical Endothelial',
    'cortical_excitatory': 'Post-Mortem Cortical Excitatory Neurons',
    'cortical_inhibitory': 'Post-Mortem Cortical Inhibitory Neurons',
    'cortical_microglia': 'Post-Mortem Cortical Microglia',
    'cortical_oligodendrocytes': 'Post-Mortem Cortical Oligodendrocytes',
    'cortical_opc': 'Post-Mortem Cortical OPCs',
    'excitatory': 'meta-Excitatory Neurons',
    'inhibitory': 'meta-Inhibitory Neurons',
    'FB': 'NIA iFBn',
    'MG': 'NIA iMGL',
    'microglia': 'meta-Microglia',
}

# ---- directories ----
work_dir = '/mnt/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
results_dir = f'{work_dir}/results'
figures_dir = f'{work_dir}/figures'

# ---- input files: one age-effect result table per study ----
phase1_results_file = f'{results_dir}/aging.glmmtmb_age_diffs.csv'
reed_results_file = f'{work_dir}/replication/aging_phase1_replication.glmmtmb_age_diffs.csv'
chien_results_file = '/mnt/labshare/raph/datasets/chien_fctx_snrna_aging/ageDEall.tsv.gz'
jeffries_results_file = '/mnt/labshare/raph/datasets/jeffries_snrna_aging_pfc/S6_DEGs_elderly_vs_adult.csv'

# ---- output files ----
out_figure_prefix = f'{figures_dir}/figure_age_effect_similarity'

if DEBUG:
    # echo every configured path as name='value', one per line
    print('\n'.join(
        f'{label}={path!r}'
        for label, path in (
            ('phase1_results_file', phase1_results_file),
            ('reed_results_file', reed_results_file),
            ('chien_results_file', chien_results_file),
            ('jeffries_results_file', jeffries_results_file),
            ('out_figure_prefix', out_figure_prefix),
        )
    ))

#### functions

In [None]:
def peek_dataframe(df: DataFrame, message: str=None, verbose: bool=False):
    """Print a short summary of a data frame.

    Args:
        df: frame to summarize.
        message: optional text printed before the summary; skipped when it is
            None or empty.
        verbose: when True, also render the first rows with `display`.
    """
    has_message = message is not None and len(message) > 0
    if has_message:
        print(message)
    print(f'{df.shape=}')
    if not verbose:
        return
    # `display` is not imported in this file; it is expected to come from the
    # IPython/Jupyter session
    display(df.head())

## load the LNG phase1 results

In [None]:
%%time
lng_results = read_csv(phase1_results_file)
peek_dataframe(lng_results, 'loaded LNG results', DEBUG)

In [None]:
# result counts per analysis type, per tissue within each type, and per tissue overall
display(lng_results['type'].value_counts())
display(lng_results.groupby('type')['tissue'].value_counts())
display(lng_results['tissue'].value_counts())

### for LNG drop the global broad cell-type results; we only used broad cell-types within region and specific cell-types

In [None]:
# Drop the global broad cell-type results. The explicit .copy() makes the subset an
# independent frame, so adding columns to `lng_results` later does not write into a
# slice of the originally loaded table (pandas chained-assignment pitfall).
lng_results = lng_results.loc[lng_results['type'] != 'broad_celltype'].copy()
peek_dataframe(lng_results, 'LNG results without global broad cell-types', DEBUG)
# re-check the counts after filtering
display(lng_results['type'].value_counts())
display(lng_results.groupby('type')['tissue'].value_counts())
display(lng_results['tissue'].value_counts())

## load CARD FCTX (Reed) results

In [None]:
%%time
reed_results = read_csv(reed_results_file)
peek_dataframe(reed_results, 'loaded CARD results', DEBUG)

In [None]:
# result counts per analysis type and per tissue
display(reed_results['type'].value_counts())
display(reed_results['tissue'].value_counts())

## load Chien et al FCTX results

In [None]:
%%time
chien_results = read_csv(chien_results_file, sep='\t', index_col=0)
peek_dataframe(chien_results, 'loaded Chien et al results', DEBUG)

### the Chien results have multiple gene IDs per gene name, so do a simple keep-first filtering

In [None]:
# Keep only the first row for each (gene_name, celltype) pair.
# De-duplicating on the two columns directly avoids building a concatenated string
# key, which could collide for different pairs and which becomes NaN (and so
# collapses rows across cell types) whenever gene_name is missing. It also avoids
# adding and then dropping a temporary column on the frame.
print(chien_results.shape)
chien_results = chien_results.drop_duplicates(subset=['gene_name', 'celltype'],
                                              keep='first')
peek_dataframe(chien_results, 'non duplicate gene results for Chien et al results', DEBUG)

In [None]:
# number of results per Chien et al cell type
display(chien_results['celltype'].value_counts())

## load the Jeffries et al results

In [None]:
# read the Jeffries et al elderly-vs-adult DEG table and report its shape
jeffries_results = read_csv(jeffries_results_file)
peek_dataframe(df=jeffries_results, message='loaded the Jeffries et al results',
               verbose=DEBUG)

## find the union of significant features across regions and cell-types

In [None]:
# features passing the ALPHA threshold on each study's own adjusted p-value column
lng_features = lng_results.loc[lng_results['fdr_bh'] <= ALPHA, 'feature'].unique()
reed_features = reed_results.loc[reed_results['fdr_bh'] <= ALPHA, 'feature'].unique()
chien_features = chien_results.loc[chien_results['adj.P.Val'] <= ALPHA,
                                   'gene_name'].unique()
jeffries_features = jeffries_results.loc[jeffries_results['q-value'] <= ALPHA,
                                         'gene name'].unique()
print(f'{len(lng_features)=}')
print(f'{len(reed_features)=}')
print(f'{len(chien_features)=}')
print(f'{len(jeffries_features)=}')

# a feature is kept if it is significant in at least one of the four studies
features_union = set(lng_features).union(reed_features, chien_features,
                                         jeffries_features)
print(f'{len(features_union)=}')

# Disabled alternative, kept for reference: restrict the union to features that were
# tested, regardless of significance, in every result set.
# NOTE(review): the last term uses jeffries_features (significant only), not the full
# Jeffries gene list — confirm intent before re-enabling.
# features_union = (features_union & set(lng_results.feature) &
#                   set(reed_results.feature) & set(chien_results.gene_name)
#                   & set(jeffries_features))
# print(f'{len(features_union)=}')

## subset and harmonize necessary column names before merging age effect results
LNG and CARD (Reed) results were regressed with the same tooling, so their result formats are the same. Only the Chien et al and Jeffries et al result columns need to be renamed to match.

In [None]:
# Rename the Chien et al columns to the names used by the LNG/CARD result tables and
# tag each row with its study. `assign` returns a new frame instead of writing a
# column into an existing one.
chien_results = (
    chien_results
    .rename(columns={'gene_name': 'feature',
                     'logFC': 'estimate',
                     'P.Value': 'p.value',
                     'celltype': 'tissue'})
    .assign(study='Chien')
)
peek_dataframe(chien_results, 'harmonized Chien et al column names', DEBUG)

# Same harmonization for Jeffries et al. Tissue labels additionally get a 'PFC: '
# prefix; the prefix is only added to labels that do not already carry it, so
# re-running this cell does not produce 'PFC: PFC: ...' labels.
jeffries_results = jeffries_results.rename(
    columns={'gene name': 'feature',
             'log2 fold change of elderly vs adult': 'estimate',
             'Wilcox\'s p-value': 'p.value',
             'cell type': 'tissue'})
jeffries_results = jeffries_results.assign(
    tissue=lambda frame: frame['tissue'].where(
        frame['tissue'].str.startswith('PFC: ', na=False),
        'PFC: ' + frame['tissue']),
    study='Jeffries')
peek_dataframe(jeffries_results, 'harmonized Jeffries et al column names', DEBUG)

# LNG and CARD already share the target column names; only the study tag is added.
# `lng_results` is a filtered subset of the loaded table, so build a new frame rather
# than assigning a column in place onto a slice.
lng_results = lng_results.assign(study='LNG')
reed_results = reed_results.assign(study='CARD')

In [None]:
# columns shared by all four harmonized result tables
keep_cols = ['feature', 'estimate', 'p.value', 'tissue', 'study']

# per study: rows whose feature is in the significant-feature union, shared columns only
lng_kept = lng_results.loc[lng_results['feature'].isin(features_union), keep_cols]
reed_kept = reed_results.loc[reed_results['feature'].isin(features_union), keep_cols]
chien_kept = chien_results.loc[chien_results['feature'].isin(features_union), keep_cols]
jeffries_kept = jeffries_results.loc[jeffries_results['feature'].isin(features_union),
                                     keep_cols]

peek_dataframe(df=lng_kept, message='subset LNG results', verbose=DEBUG)
peek_dataframe(df=reed_kept, message='subset CARD (Reed) results', verbose=DEBUG)
peek_dataframe(df=chien_kept, message='subset Chien et al results', verbose=DEBUG)
peek_dataframe(df=jeffries_kept, message='subset Jeffries et al results', verbose=DEBUG)

In [None]:
# stack the four per-study subsets into one long table
results_df = concat([
    lng_kept,
    reed_kept,
    chien_kept,
    jeffries_kept,
])
peek_dataframe(df=results_df, message='combine results', verbose=DEBUG)
if DEBUG:
    # row counts per study, per tissue, and per tissue within study
    display(results_df['study'].value_counts())
    display(results_df['tissue'].value_counts())
    display(results_df.groupby('study')['tissue'].value_counts())

## format a wide matrix for running simple correlation

In [None]:
# one row per feature, one column per tissue, cells hold the age-effect estimate;
# pivot raises if any (feature, tissue) pair occurs more than once
wide_results = results_df.pivot(index='feature', columns='tissue', values='estimate')
peek_dataframe(df=wide_results, message='wide pivot of results', verbose=DEBUG)

## generate the correlation matrix of the cell-types

In [None]:
# pairwise Spearman correlation between tissue columns, rounded to two decimals
corr_matrix = wide_results.corr(method='spearman').round(decimals=2)
peek_dataframe(df=corr_matrix, message='Spearman correlation matrix of cell types',
               verbose=DEBUG)

## visualize the correlations for just the frontal cortex studies

In [None]:
# tissues of each study that are present in the correlation matrix
jeffries_cols = sorted(set(jeffries_results.tissue).intersection(corr_matrix.columns))
chien_cols = sorted(set(chien_results.tissue).intersection(corr_matrix.columns))
# rows: Jeffries tissues, columns: Chien tissues
frmt_corr_matrix = corr_matrix.loc[corr_matrix.columns.isin(jeffries_cols), chien_cols]

# True where the correlation is above the cutoff or below its negative
mask = frmt_corr_matrix.abs() > SPEARMAN_CUTOFF
# annotation text: the correlation value where the mask holds, blank elsewhere
annot_array = where(mask, frmt_corr_matrix.values, '')

with rc_context({'figure.figsize': (9, 9), 'figure.dpi': 500}):
    # colour scale is left to seaborn's defaults (no vmin/vmax/center is passed)
    heatmap(
        frmt_corr_matrix,
        annot=annot_array,
        fmt='',
        annot_kws={'size': 8},
        cmap='coolwarm',
        linewidths=0.5,
    )
    plt.xlabel('Chien et al')
    plt.ylabel('Jeffries et al')
    plt.title('Age Effect Correlations between Frontal Cortex Studies')
    # plt.savefig(f'{out_figure_prefix}.png', bbox_inches='tight')
    # plt.savefig(f'{out_figure_prefix}.svg', bbox_inches='tight')
    plt.show()

In [None]:
# note: from here on `chien_cols` holds the Chien AND Jeffries tissues found in the
# correlation matrix
chien_cols = sorted(
    (set(chien_results.tissue) | set(jeffries_results.tissue))
    .intersection(corr_matrix.columns)
)
reed_cols = sorted(set(reed_results.tissue).intersection(corr_matrix.columns))
# rows: Chien/Jeffries tissues, columns: CARD (Reed) tissues
frmt_corr_matrix = corr_matrix.loc[corr_matrix.columns.isin(chien_cols), reed_cols]

# True where the correlation is above the cutoff or below its negative
mask = frmt_corr_matrix.abs() > SPEARMAN_CUTOFF
# annotation text: the correlation value where the mask holds, blank elsewhere
annot_array = where(mask, frmt_corr_matrix.values, '')

with rc_context({'figure.figsize': (9, 9), 'figure.dpi': 500}):
    # colour scale is left to seaborn's defaults (no vmin/vmax/center is passed)
    heatmap(
        frmt_corr_matrix,
        annot=annot_array,
        fmt='',
        annot_kws={'size': 8},
        cmap='coolwarm',
        linewidths=0.5,
    )
    plt.xlabel('CARD (Reed)')
    plt.ylabel('Chien et al & Jeffries et al')
    plt.title('Age Effect Correlations between Frontal Cortex Studies')
    # plt.savefig(f'{out_figure_prefix}.png', bbox_inches='tight')
    # plt.savefig(f'{out_figure_prefix}.svg', bbox_inches='tight')
    plt.show()

## visualize the effects between all studies using LNG broad per region

In [None]:
# tissues of the Chien, CARD (Reed) and Jeffries results found in the correlation matrix
fctx_cols = sorted(
    (set(chien_results.tissue)
     | set(reed_results.tissue)
     | set(jeffries_results.tissue))
    .intersection(corr_matrix.columns)
)
# LNG tissues restricted to the region-broad cell-type results
temp_lng = lng_results.loc[lng_results['type'] == 'region_broad_celltype']
lng_cols = sorted(set(temp_lng.tissue).intersection(corr_matrix.columns))
# rows: LNG tissues, columns: tissues of the other three studies
frmt_corr_matrix = corr_matrix.loc[corr_matrix.columns.isin(lng_cols), fctx_cols]

# True where the correlation is above the cutoff or below its negative
mask = frmt_corr_matrix.abs() > SPEARMAN_CUTOFF
# annotation text: the correlation value where the mask holds, blank elsewhere
annot_array = where(mask, frmt_corr_matrix.values, '')

with rc_context({'figure.figsize': (20, 18), 'figure.dpi': 100}):
    # plt.style.use('seaborn-v0_8-paper')
    # colour scale is left to seaborn's defaults (no vmin/vmax/center is passed)
    heatmap(
        frmt_corr_matrix,
        annot=annot_array,
        fmt='',
        annot_kws={'size': 8},
        cmap='coolwarm',
        linewidths=0.5,
    )
    plt.xlabel('Frontal Cortex Studies')
    plt.ylabel('LNG Phase1')
    plt.title('Age Effect Correlations between Studies, region broad')
    # plt.savefig(f'{out_figure_prefix}.png', bbox_inches='tight')
    # plt.savefig(f'{out_figure_prefix}.svg', bbox_inches='tight')
    plt.show()

## visualize the effects between all studies using LNG specific cell-types

In [None]:
# tissues of the Chien, CARD (Reed) and Jeffries results found in the correlation matrix
fctx_cols = sorted(
    (set(chien_results.tissue)
     | set(reed_results.tissue)
     | set(jeffries_results.tissue))
    .intersection(corr_matrix.columns)
)
# LNG tissues restricted to the specific cell-type results
temp_lng = lng_results.loc[lng_results['type'] == 'specific_celltype']
lng_cols = sorted(set(temp_lng.tissue).intersection(corr_matrix.columns))
# rows: LNG tissues, columns: tissues of the other three studies
frmt_corr_matrix = corr_matrix.loc[corr_matrix.columns.isin(lng_cols), fctx_cols]

# True where the correlation is above the cutoff or below its negative
mask = frmt_corr_matrix.abs() > SPEARMAN_CUTOFF
# annotation text: the correlation value where the mask holds, blank elsewhere
annot_array = where(mask, frmt_corr_matrix.values, '')

with rc_context({'figure.figsize': (20, 20), 'figure.dpi': 100}):
    # plt.style.use('seaborn-v0_8-paper')
    # colour scale is left to seaborn's defaults (no vmin/vmax/center is passed)
    heatmap(
        frmt_corr_matrix,
        annot=annot_array,
        fmt='',
        annot_kws={'size': 8},
        cmap='coolwarm',
        linewidths=0.5,
    )
    plt.xlabel('Frontal Cortex Studies')
    plt.ylabel('LNG Phase1')
    plt.title('Age Effect Correlations between Studies, specific cell-types')
    # plt.savefig(f'{out_figure_prefix}.png', bbox_inches='tight')
    # plt.savefig(f'{out_figure_prefix}.svg', bbox_inches='tight')
    plt.show()

In [None]:
# print the shell's current date and time again so the output shows when the run finished
!date