## Notebook to run post processing of differential expression in single-cell data for glm pseudo-bulk based analysis

basically 
- read glm results per region and cell-type and then integrate them
- apply Benjamini-Hochberg (B&H) FDR correction
- take a look at the overlap between brain regions and cell-types, and do some sample plotting

In [None]:
# shell out to `date` so the time of this run is captured in the cell output
!date

#### import libraries

In [None]:
from anndata import AnnData
import numpy as np
from pandas import DataFrame, concat, read_csv, Series, set_option as pd_set_option
import scanpy as sc
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import json
from os.path import exists
from sklearn.preprocessing import MinMaxScaler

import warnings
# silence all warnings for the rest of the notebook to keep cell output readable
warnings.simplefilter('ignore')

import random
# seed both generators: the stdlib one and NumPy's global one; pandas
# `.sample()` draws from NumPy's global state when no `random_state` is
# passed, so seeding only `random` leaves the sampled rows non-reproducible
random.seed(420)
np.random.seed(420)

#### set notebook variables

In [None]:
# parameters
project = 'aging_phase1'
# prefix shared by the input anndata file and both output files below
set_name = f'{project}_replication'
# NOTE(review): `cohort` is not referenced anywhere in the visible notebook
cohort = 'aging'

# directories for initial setup
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
replication_dir = f'{wrk_dir}/replication'

# in files
anndata_file = f'{replication_dir}/{set_name}.scvi.h5ad'
# deliberately a plain (non-f) string: it is a template that is filled in
# later with `.format(this_dir=..., name=...)`, once per cell type
tissue_result_file = '{this_dir}/{name}_glm_pb_age_diffs.csv'

# out files
# all integrated results, including the adjusted p-value column
results_file = f'{replication_dir}/{set_name}.glm_pb_age_diffs.csv'
# only the rows whose adjusted p-value is below 0.05
results_fdr_file = f'{replication_dir}/{set_name}.glm_pb_age_diffs_fdr.csv'

# constants
# when True, cells additionally display sample rows and summary tables
DEBUG = True
# key passed to `AnnData.to_df(...)` when expression values are extracted
SCVI_NORMALIZED_KEY = 'scvi_normalized'
# let pandas show up to 500 rows before truncating displayed frames
pd_set_option('display.max_rows', 500)

#### analysis functions

In [None]:
def read_glm_results(cell_name: str, result_type: str, in_file: str) -> DataFrame:
    """Load one GLM result table and tag every row with its origin.

    Args:
        cell_name: value written to the added `tissue` column.
        result_type: value written to the added `type` column.
        in_file: CSV to read (anything `read_csv` accepts).

    Returns:
        The table from `in_file` with `tissue` and `type` columns appended.
    """
    return read_csv(in_file).assign(tissue=cell_name, type=result_type)

def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p-value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Return a copy of `df` with multiple-testing adjusted p-values added.

    Args:
        df: table holding the raw p-values; it is not modified.
        alpha: level passed to `multipletests` and used for the printed count.
        p_col: name of the column holding the raw p-values.
        method: correction method passed to `multipletests`; the adjusted
            values are stored in a new column with this same name.
        verbose: when True, print the shape of the rows below `alpha`.

    Returns:
        A copy of `df` with one extra column named after `method`.
    """
    raw_p_values = np.array(df[p_col])
    # the second element of the returned tuple holds the corrected p-values
    _, adjusted_p_values, *_ = multipletests(raw_p_values, alpha=alpha,
                                             method=method)
    ret_df = df.copy()
    ret_df[method] = adjusted_p_values
    if verbose:
        significant = ret_df.loc[ret_df[method] < alpha]
        print(f'total significant after correction: {significant.shape}')
    return ret_df

def compute_frmt_pb(df: DataFrame, feature: str) -> DataFrame:
    """Average `feature` within each sample and re-attach sample covariates.

    Args:
        df: table with `Sample_ID`, `old`, `female` and the `feature` column.
        feature: name of the column to average per `Sample_ID`.

    Returns:
        One row per sample (per distinct covariate combination) holding the
        mean of `feature` plus the `Sample_ID`, `old` and `female` columns.
    """
    per_sample_mean = df.groupby('Sample_ID')[[feature]].mean()
    sample_covariates = df.loc[:, ['Sample_ID', 'old', 'female']].drop_duplicates()
    # the means are indexed by Sample_ID, the covariates carry it as a column
    return per_sample_mean.merge(sample_covariates, how='left',
                                 left_index=True, right_on='Sample_ID')

def plot_feature_by_age_group(df: DataFrame, x_term: str, y_term: str):
    """Plot the per-sample mean of `y_term` split by `x_term`.

    The rows of `df` are first collapsed to one mean per sample with
    `compute_frmt_pb`; a boxen plot with an overlaid strip plot of those
    means is then shown.

    Args:
        df: table accepted by `compute_frmt_pb`.
        x_term: column placed on the x axis (grouping variable).
        y_term: feature column that is averaged and placed on the y axis.
    """
    plt.figure(figsize=(9,9))
    per_sample = compute_frmt_pb(df, y_term)
    sns.boxenplot(data=per_sample, x=x_term, y=y_term,
                  scale='exponential', k_depth='trustworthy')
    # overlay the individual sample means on top of the boxen plot
    sns.stripplot(data=per_sample, x=x_term, y=y_term,
                  alpha=0.75, jitter=True, color='darkgrey')
    plt.title(f'{y_term} ~ {x_term}', fontsize='large')
    plt.xlabel(x_term)
    plt.ylabel(y_term)
    plt.show()
    
def plot_feature_by_sample(df: DataFrame, x_term: str, y_term: str):
    """Plot the distribution of `y_term` over the rows of every sample.

    Draws one boxen plot per `Sample_ID`, coloured by `x_term`, with a strip
    plot of the individual rows on top. Samples of the 'young' `Age_group`
    are placed first, followed by those of the 'old' group.

    Args:
        df: table with `Age_group`, `Sample_ID`, `x_term` and `y_term` columns.
        x_term: column used as the hue of the boxen plot.
        y_term: feature column placed on the y axis.
    """
    # set up order by young then old
    sample_ids = df.groupby('Age_group')['Sample_ID'].unique()
    # `list(...)` rather than `.to_list()`: the grouped uniques are a plain
    # ndarray for object columns (which has no `.to_list`) and a Categorical
    # for categorical ones; an age group that is absent from `df` is skipped
    # instead of raising a KeyError
    this_list = [sample_id
                 for age_group in ('young', 'old')
                 if age_group in sample_ids.index
                 for sample_id in list(sample_ids[age_group])]
    plt.figure(figsize=(9,9))
    sns.boxenplot(x='Sample_ID',y=y_term, scale='exponential', data=df,
                  k_depth='trustworthy', hue=x_term, order=this_list)
    # overlay the individual rows on top of the boxen plot, in the same order
    sns.stripplot(x='Sample_ID',y=y_term, data=df, alpha=0.75,
                  jitter=True, color='darkgrey', order=this_list)
    plt.xticks(rotation=75)
    plt.title(f'{y_term} ~ {x_term}', fontsize='large')
    plt.xlabel('Sample')
    plt.ylabel(y_term)
    plt.show()
    
def volcano_plot(df: DataFrame, x_term: str='coef', y_term: str='p-value', 
                 alpha: float=0.05, adj_p_col: str='fdr_bh', title: str=None, 
                 filter_nseeff: bool=True, extreme_size: float=10.0):
    """Show a volcano plot: effect size against -log10 of the p-value.

    Args:
        df: result table with `x_term`, `y_term`, `adj_p_col` and `z` columns.
        x_term: column with the effect estimate (x axis).
        y_term: column with the raw p-value (y axis, -log10 transformed).
        alpha: threshold on `adj_p_col` used to colour points as significant.
        adj_p_col: column with the adjusted p-values.
        title: figure title.
        filter_nseeff: when True, only keep rows whose effect lies strictly
            inside (-extreme_size, extreme_size) and whose `z` is not
            missing; rows with `adj_p_col` below `alpha` are always kept.
        extreme_size: bound on the absolute effect used by the filter.
    """
    if filter_nseeff:
        effect_in_range = (-extreme_size < df[x_term]) & (df[x_term] < extreme_size)
        has_z = df['z'].notna()
        passes_alpha = df[adj_p_col] < alpha
        # `&` binds tighter than `|`: significant rows bypass the other filters
        df = df.loc[(effect_in_range & has_z) | passes_alpha]
    plt.figure(figsize=(9,9))
    neg_log_p = -np.log10(df[y_term])
    significant = df[adj_p_col] < alpha
    sns.scatterplot(data=df, x=x_term, y=neg_log_p, hue=significant,
                    palette='Purples')
    plt.title(title)
    plt.xlabel('effect')
    plt.ylabel('-log10(p-value)')
    plt.show()
    
def prep_plot_feature(data: AnnData, feature_results: Series, 
                      group: str='old'):
    """Print one result row and plot its feature by group and by sample.

    Args:
        data: AnnData object converted to a table with `convert_ad_to_df`.
        feature_results: a single result row; its `feature` entry names the
            column that is plotted.
        group: column used to group (and colour) the plots.
    """
    plot_df = convert_ad_to_df(data)
    print(feature_results)
    sns.set_theme(style='white', palette='Paired', font_scale=1.2)
    feature_name = feature_results.feature
    # same arguments for both views: per age group first, then per sample
    for plot_fn in (plot_feature_by_age_group, plot_feature_by_sample):
        plot_fn(plot_df, group, feature_name)
    
def subset_anndata(data: AnnData, cell_name: str, reapply_filter: bool=True, 
                   min_cell_count: int=3, verbose: bool=False) -> AnnData:
    """Return a copy of `data` restricted to one cell type.

    Args:
        data: AnnData object whose `obs` has a `Cell_type` column.
        cell_name: value of `obs.Cell_type` to keep.
        reapply_filter: when True, re-run scanpy's gene and cell filters on
            the subset.
        min_cell_count: threshold passed as `min_counts` to both filters.
            NOTE(review): the name suggests `min_cells` may have been
            intended for the gene filter - confirm before changing.
        verbose: when True, print the shapes before/after and the subset.

    Returns:
        The (optionally filtered) subset as an independent copy.
    """
    this_data = data[(data.obs.Cell_type == cell_name)].copy()
    shape_before = this_data.shape
    if reapply_filter:
        sc.pp.filter_genes(this_data, min_counts=min_cell_count)
        sc.pp.filter_cells(this_data, min_counts=min_cell_count)
    # read the shape unconditionally so the verbose report also works when
    # no filter was applied (it was previously unbound in that case)
    shape_after = this_data.shape
    if verbose:
        print(f'subset complete, shape before and after: {shape_before} {shape_after}')
        print(this_data)
    return this_data  

def scale_dataframe(this_df : DataFrame):
    """Min-max scale every column of `this_df`, keeping its labels.

    Args:
        this_df: numeric table to scale.

    Returns:
        A new DataFrame with the output of `MinMaxScaler().fit_transform`
        and the same index and columns as the input.
    """
    scaler = MinMaxScaler()
    return DataFrame(data=scaler.fit_transform(this_df),
                     index=this_df.index, columns=this_df.columns)

def convert_ad_to_df(data: AnnData, young_age_limit: float=30.0, 
                     scale: bool=True, verbose: bool=False) -> DataFrame:
    """Combine expression values and sample annotations into one table.

    Args:
        data: AnnData object; values are taken via
            `data.to_df(SCVI_NORMALIZED_KEY)` and annotations from the
            `Sample_ID`, `Age` and `Sex` columns of `data.obs`.
        young_age_limit: rows with `Age` strictly above this are 'old'.
        scale: when True, min-max scale the values with `scale_dataframe`.
        verbose: when True, print the resulting shape and display the head.

    Returns:
        The values with `Sample_ID`, `Age`, `Sex`, `old`, `Age_group` and
        `female` columns appended, or None when the index of the values and
        the index of the annotations are not equal.
    """
    values_df = data.to_df(SCVI_NORMALIZED_KEY)
    if scale:
        values_df = scale_dataframe(values_df)
    annots = data.obs[['Sample_ID', 'Age','Sex']].copy()
    # one age comparison feeds both the 0/1 flag and the text label
    above_limit = annots['Age'] > young_age_limit
    annots['old'] = np.where(above_limit, 1, 0)
    annots['Age_group'] = np.where(above_limit, 'old', 'young')
    annots['female'] = np.where(annots['Sex'] == 'female', 1, 0)
    # refuse to combine tables whose rows do not line up
    if not values_df.index.equals(annots.index):
        return None
    this_df = concat([values_df, annots], axis='columns')
    this_df.index.name = 'barcodekey'
    if verbose:
        print(f'anndata to pandas df complete: {this_df.shape}')
        print(this_df.shape)
        display(this_df.head())
    return this_df

### load replication cohort data

#### read the anndata (h5ad) file

In [None]:
%%time
# read the AnnData object from the h5ad path set in the parameters cell
adata = sc.read(anndata_file, cache=True)
print(adata)
if DEBUG:
    # show 5 randomly chosen rows of `adata.obs`
    display(adata.obs.sample(5))

#### take a look at the cell counts by cell type

### get the lists of broad cell types

In [None]:
# distinct labels found in the `Cell_type` and `Brain_region` columns of obs
broad_cell_types = list(adata.obs['Cell_type'].unique())
brain_regions = list(adata.obs['Brain_region'].unique())
# report the count followed by the values, cell types first
for label_values in (broad_cell_types, brain_regions):
    print(len(label_values))
    print(label_values)

### read the diff by age results by region and cell-type

In [None]:
%%time

glm_results = None
for cell_type in broad_cell_types:
    cell_name = f'Frontal_cortex_{cell_type}'
    print(f'--- {cell_name}')
    this_type = 'broad_celltype'
    this_file = tissue_result_file.format(this_dir=replication_dir, 
                                          name=cell_name.replace(" ", "_"))
    if exists(this_file):
        glm_results = concat([glm_results, read_glm_results(cell_name, this_type, 
                                                            this_file)])

In [None]:
print(f'shape of all load results {glm_results.shape}')
if DEBUG:
    # rows per result type, rows per tissue within type, then 5 random rows
    for debug_view in (glm_results['type'].value_counts(),
                       glm_results.groupby('type')['tissue'].value_counts(),
                       glm_results.sample(5)):
        display(debug_view)

### compute the FDR values

In [None]:
# missing p-values are set to 1 before the correction is applied
glm_results = compute_bh_fdr(glm_results.fillna({'p-value': 1}))
print(glm_results.shape)
if DEBUG:
    # rows with the smallest adjusted p-values
    display(glm_results.sort_values(by='fdr_bh').head(5))

In [None]:
# adjusted vs raw p-values, with both 0.05 thresholds marked
with rc_context({'figure.figsize': (9, 9)}):
    # refer to the columns by name: passing Series next to `data=` relies on
    # index alignment, and the labels of the concatenated table can repeat
    sns.scatterplot(data=glm_results, x='fdr_bh', y='p-value')
    plt.axhline(y=0.05, linestyle='--')
    plt.axvline(x=0.05, linestyle='--')
    plt.show()

### count of significant genes by tissue (brain region and cell-type)

In [None]:
# number of tissues with at least one result below the 0.05 FDR threshold
print(glm_results.loc[glm_results['fdr_bh'] < 0.05, 'tissue'].nunique())
# number of such results per tissue, within each result type
display(glm_results[glm_results['fdr_bh'] < 0.05].groupby('type')['tissue'].value_counts())

### save results

#### save the full results

In [None]:
# write every result row, without the row index, to the full results CSV
glm_results.to_csv(results_file, index=False)

#### save the statistically significant results

In [None]:
# keep only rows below the 0.05 FDR threshold and write them without the index
glm_results[glm_results['fdr_bh'] < 0.05].to_csv(results_fdr_file, index=False)

### visualize results

#### visualize volcano plots

In [None]:
# one plot over everything, then one plot per cell type
volcano_plot(glm_results, title='all_results')

print('### broad cell-types without regard for region')
this_type = 'broad_celltype'
for cell_type in broad_cell_types:
    cell_name = f'Frontal_cortex_{cell_type}'
    print(f'--- {cell_name}')
    volcano_plot(glm_results[(glm_results['tissue'] == cell_name) &
                             (glm_results['type'] == this_type)],
                 title=cell_name)

#### look at some of the individual results

In [None]:
# most significant result: smallest p-value, ties broken by the largest coef
this_results = glm_results[glm_results['p-value'] == min(glm_results['p-value'])]
this_hit = this_results.sort_values('coef', ascending=False).iloc[0]
# the tissue label is 'Frontal_cortex_<cell type>'; recover the cell type
broad_cell_name = this_hit['tissue'].replace('Frontal_cortex_', '')
adata_sub = subset_anndata(adata, broad_cell_name)
prep_plot_feature(adata_sub, this_hit)

In [None]:
# significant result with the smallest coef
sig_results = glm_results.loc[glm_results['fdr_bh'] < 0.05]
if sig_results.empty:
    # nothing passed the FDR threshold: say so instead of failing inside min()
    print('no significant results to plot')
else:
    # Series.min() is vectorized and skips missing values; all rows selected
    # here share the same coef, so no further sorting is needed
    this_results = sig_results.loc[sig_results['coef'] == sig_results['coef'].min()]
    this_hit = this_results.iloc[0]
    # the tissue label is 'Frontal_cortex_<cell type>'; recover the cell type
    broad_cell_name = this_hit.tissue.replace('Frontal_cortex_', '')
    adata_sub = subset_anndata(adata, broad_cell_name)
    prep_plot_feature(adata_sub, this_hit)

In [None]:
# random significant result
if sig_results.empty:
    # sampling from an empty table raises; report it instead
    print('no significant results to sample from')
else:
    this_hit = sig_results.sample().iloc[0]
    # the tissue label is 'Frontal_cortex_<cell type>'; recover the cell type
    broad_cell_name = this_hit.tissue.replace('Frontal_cortex_', '')
    adata_sub = subset_anndata(adata, broad_cell_name)
    prep_plot_feature(adata_sub, this_hit)

In [None]:
# non-significant result with the largest coef
# `>=` so that rows exactly at the threshold are counted as non-significant,
# complementing the `< 0.05` used for the significant set
nonsig_results = glm_results.loc[(glm_results['fdr_bh'] >= 0.05) & 
                                 (~glm_results['z'].isna())]
if nonsig_results.empty:
    print('no non-significant results to plot')
else:
    # Series.max() skips missing values, whereas the builtin max() returns a
    # result that depends on where a NaN sits in the column
    this_results = nonsig_results.loc[nonsig_results['coef'] == nonsig_results['coef'].max()]
    this_hit = this_results.iloc[0]
    # the tissue label is 'Frontal_cortex_<cell type>'; recover the cell type
    broad_cell_name = this_hit.tissue.replace('Frontal_cortex_', '')
    adata_sub = subset_anndata(adata, broad_cell_name)
    prep_plot_feature(adata_sub, this_hit)