## Compare results from replication analysis against the discovery data

- compare the replication frontal cortex broad cell-type results against the discovery non-region specific broad cell-types and the discovery region specific broad cell-types
- use regression to model the effect coefficients and scores between the replication and discovery results

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame, concat
from numpy import corrcoef
from seaborn import heatmap, lmplot, set_theme as sns_set_theme
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

#### set notebook variables

In [None]:
# naming parameters for this analysis
project = 'aging_phase1'
cohort = 'aging'
set_name = f'{project}_replication'

# directory layout under the working directory
wrk_dir = '/home/jupyter/brain_aging_phase1'
results_dir = f'{wrk_dir}/results'
replication_dir = f'{wrk_dir}/replication'

# in files: one results table for discovery, one for replication
discovery_results_file = f'{results_dir}/{cohort}.glmmtmb_age_diffs.csv'
replication_results_file = f'{replication_dir}/{set_name}.glmmtmb_age_diffs.csv'

# out files: none defined here

# constants
# DEBUG switches on the extra previews shown via display() in later cells
DEBUG = True
# region and broad cell-type labels of the discovery data; combined as
# '<region> <cell-type>' when ordering heatmap rows
DISC_REGIONS = [
    'Entorhinal cortex',
    'Middle temporal gyrus',
    'Putamen',
    'Subventricular zone',
]
DISC_CELLTYPES = [
    'Astrocyte',
    'Endothelial',
    'Ependymal',
    'ExN',
    'InN',
    'Microglia',
    'Mural',
    'Oligodendrocyte',
    'OPC',
    'SPN',
]
REP_REGION = 'Frontal cortex'
# full cell-type name -> abbreviation
cell_abbr_mappings = {
    'ExN': 'ExN',
    'Oligodendrocyte': 'Oligo',
    'Astrocyte': 'Astro',
    'InN': 'InN',
    'OPC': 'OPC',
    'Microglia': 'Micro',
    'Endothelial': 'Endo',
}


# notebook-wide seaborn theme applied to all later plots
sns_set_theme(font_scale=1.2, palette='Paired', style='white')

### load the results

#### load the discovery results

In [None]:
# read the discovery results table and report its dimensions
disc_df = read_csv(discovery_results_file)
print(f'shape of discovery results {disc_df.shape}')
# in debug mode preview five randomly chosen rows
if DEBUG:
    display(disc_df.sample(n=5))

##### what type of analysis results are present

In [None]:
# for each analysis type, print its name followed by the tissues it covers
for result_type in disc_df['type'].unique():
    of_this_type = disc_df['type'] == result_type
    print(result_type)
    print(disc_df.loc[of_this_type, 'tissue'].unique())

##### drop the cluster specific cell-types from discovery result
the replication only used broad cell-types

In [None]:
# keep every row whose analysis type is not 'specific_celltype'
keep_rows = disc_df['type'] != 'specific_celltype'
disc_df = disc_df.loc[keep_rows]
print(f'new shape of discovery results {disc_df.shape}')
if DEBUG:
    display(disc_df.sample(n=5))
# number of remaining rows per analysis type
display(disc_df['type'].value_counts())

#### load the replication results

In [None]:
# read the replication results table and report its dimensions
rep_df = read_csv(replication_results_file)
print(f'shape of replication results {rep_df.shape}')
# in debug mode preview five randomly chosen rows
if DEBUG:
    display(rep_df.sample(n=5))

##### what type of analysis results are present

In [None]:
# for each analysis type, print its name followed by the tissues it covers
for result_type in rep_df['type'].unique():
    of_this_type = rep_df['type'] == result_type
    print(result_type)
    print(rep_df.loc[of_this_type, 'tissue'].unique())

### check all replication and discovery tissue pairings by FDR

In [None]:
# For every discovery/replication tissue pairing, compute the percentage of the
# discovery tissue's FDR-significant rows whose feature is also FDR-significant
# in the replication tissue.
disc_sig = disc_df.loc[disc_df.fdr_bh < 0.05]
rep_sig = rep_df.loc[rep_df.fdr_bh < 0.05]
# the significant feature set of a replication tissue does not depend on the
# discovery tissue, so build each set once instead of once per outer iteration
rep_sig_features = {
    rep_tissue: set(rep_sig.loc[rep_sig.tissue == rep_tissue, 'feature'])
    for rep_tissue in rep_df.tissue.unique()
}
fdr_intersects = []
for disc_tissue in disc_df.tissue.unique():
    disc_tissue_sig = disc_sig.loc[disc_sig.tissue == disc_tissue]
    disc_features = set(disc_tissue_sig.feature)
    disc_sig_cnt = disc_tissue_sig.shape[0]
    for rep_tissue, rep_features in rep_sig_features.items():
        shared_cnt = len(disc_features & rep_features)
        if disc_sig_cnt > 0:
            percent_shared = (shared_cnt/disc_sig_cnt)*100
        else:
            # no significant discovery features: the percentage is undefined,
            # record NaN rather than dividing by zero
            percent_shared = float('nan')
        fdr_intersects.append((disc_tissue, rep_tissue, percent_shared))
fdr_results_df = DataFrame(fdr_intersects, columns=['discover', 'replication', 'percent'])
print(f'shape of FDR intersects data frame is {fdr_results_df.shape}')
if DEBUG:
    print(f'fdr instersects has {len(fdr_intersects)} items')
    display(fdr_intersects[:5])
    display(fdr_results_df.head())
    # pairings with the highest shared percentage first (NaN rows sort last)
    display(fdr_results_df.sort_values('percent', ascending=False).head(20))

### reformat results for running regressions

In [None]:
def metric_pivot_df(df: DataFrame, metric: str, verbose: bool=False) -> DataFrame:
    """Reshape long-format results into a tissue-by-feature table of one metric.

    Args:
        df: long-format results with `tissue` and `feature` columns plus the
            column named by `metric`.
        metric: name of the column whose values fill the table.
        verbose: when True, display the first rows of the reshaped table.

    Returns:
        DataFrame indexed by tissue with one column per feature.
    """
    wide_df = df.pivot(columns='feature', index='tissue', values=metric)
    wide_shape = wide_df.shape
    print(f'shape of coefficients dataframe {wide_shape}')
    if verbose:
        display(wide_df.head())
    return wide_df

def combine_metric_pivot_df(df1: DataFrame, df2: DataFrame, metric: str, 
                            fill_value: int=0, verbose: bool=False) -> DataFrame:
    """Pivot one metric from two result sets and stack the tables row-wise.

    Args:
        df1: first long-format results table.
        df2: second long-format results table.
        metric: name of the results column to pivot.
        fill_value: value intended for missing entries. It is currently NOT
            applied (the fillna step below is disabled), so missing entries
            stay NaN and callers decide whether to fill or drop them.
        verbose: when True, display the first rows of the combined table.

    Returns:
        DataFrame with the tissues of `df1` followed by the tissues of `df2`
        as rows and the union of their features as columns.
    """
    m_df1 = metric_pivot_df(df1, metric)
    m_df2  = metric_pivot_df(df2, metric)
    ret_df = concat([m_df1, m_df2])
    # if a coefficient is missing, assume no effect and set to fill_value
    # ret_df = ret_df.fillna(fill_value)
    print(f'shape of coefficients dataframe for combined {ret_df.shape}')
    if verbose:
        # show the table built here, not a same-purpose notebook global
        display(ret_df.head())
    return ret_df

In [None]:
# one combined discovery + replication table per metric; each entry pairs the
# results column with the value intended for missing entries
# (1 = non-significant p-value, 0 = no effect)
# NOTE(review): combine_metric_pivot_df has its fillna step commented out, so
# the fill values are passed along but not applied at this point
metric_fills = (('p.value', 1), ('estimate', 0), ('statistic', 0))
pvalues_df, coef_df, scores_df = (
    combine_metric_pivot_df(disc_df, rep_df, metric, fill_value=fill)
    for metric, fill in metric_fills
)

### visualize correlations between result sets

In [None]:
def format_corr_df(df: DataFrame, verbose: bool=False,
                   rep_prefix: str='Frontal') -> DataFrame:
    """Correlate the rows of `df` and keep the discovery-by-replication block.

    Args:
        df: table with one row per tissue and one column per feature; it must
            not contain missing values for the correlations to be defined.
        verbose: when True, display the first rows of the result.
        rep_prefix: row labels starting with this prefix are treated as
            replication tissues, all other rows as discovery tissues.

    Returns:
        DataFrame of row-wise correlation coefficients with discovery tissues
        as rows and replication tissues as columns.
    """
    # label both axes from the frame that was actually correlated
    labels = df.index
    ret_df = DataFrame(corrcoef(df), columns=labels, index=labels)
    is_rep = labels.str.startswith(rep_prefix)
    # rows: discovery tissues, columns: replication tissues
    ret_df = ret_df.loc[~is_rep, is_rep]
    print(f'shape of return dataframe {ret_df.shape}')
    if verbose:
        display(ret_df.head())
    return ret_df
        
def get_desired_index_order(df: DataFrame, verbose: bool=False,
                            celltypes: list=None, regions: list=None) -> list:
    """Build the row order for the heatmaps, grouped by broad cell-type.

    Each broad cell-type is listed first, followed by its
    '<region> <cell-type>' labels for the regions where that exact label
    occurs in `df.tissue`.

    Args:
        df: results table with a `tissue` column.
        verbose: when True, print the order and the labels that differ between
            it and the tissues present in `df`.
        celltypes: broad cell-type names to order by; defaults to the
            notebook constant DISC_CELLTYPES.
        regions: region names to combine with each cell-type; defaults to the
            notebook constant DISC_REGIONS.

    Returns:
        List of tissue labels in display order.
    """
    celltypes = DISC_CELLTYPES if celltypes is None else celltypes
    regions = DISC_REGIONS if regions is None else regions
    # exact membership test: a regex/substring match could admit a label that
    # is only part of a longer tissue name and so is not itself a tissue
    known_tissues = set(df.tissue)
    new_index_order = []
    for celltype in celltypes:
        new_index_order.append(celltype)
        for region in regions:
            this_tissue = f'{region} {celltype}'
            if this_tissue in known_tissues:
                new_index_order.append(this_tissue)
    if verbose:
        print(new_index_order)
        # check nothing diff between the order and the tissues in the data
        print(set(new_index_order) ^ known_tissues)
    return new_index_order
        
def visualize_correlations(df: DataFrame, ordered_list: list, 
                           metric: str, corr_min: float=0.3):
    """Draw an annotated discovery-by-replication heatmap.

    Args:
        df: table with discovery tissues as rows and replication tissues as
            columns.
        ordered_list: row labels in the order they should be drawn; `df` is
            reindexed to this list.
        metric: name of the plotted quantity, used in the figure title.
        corr_min: cell annotations whose displayed value is below this
            threshold are blanked so only the stronger cells are labelled.
    """
    with rc_context({'figure.figsize': (18, 18)}):
        ax = heatmap(data=df.reindex(ordered_list), linecolor='white', 
                     linewidth=0.2, cmap='Purples', annot=True)
        # hide the annotation of every cell below the threshold
        for text in ax.texts:
            if float(text.get_text()) < corr_min:
                text.set_text('')
        # label through the heatmap axes rather than pyplot's current axes
        ax.set_title(f'Correlation between Discovery and Replication {metric}',
                     fontsize=18, fontweight='bold')
        ax.set_xlabel('Replication cell-types')
        ax.set_ylabel('Discovery cell-types')
        # Display the heatmap
        plt.show()

In [None]:
index_order = get_desired_index_order(disc_df)
# one heatmap per metric: (table, fill for missing values, title label,
# annotation threshold); missing p-values are treated as non-significant (1),
# missing coefficients and test statistics as no effect (0)
heatmap_specs = (
    (pvalues_df, 1, 'p-values', 0.3),
    (coef_df, 0, 'coefficients', 0.2),
    (scores_df, 0, 'score', 0.2),
)
for source_df, missing_fill, label, threshold in heatmap_specs:
    corr_df = format_corr_df(source_df.fillna(missing_fill))
    visualize_correlations(corr_df, index_order, label, corr_min=threshold)

### model comparisons of results using GLM

In [None]:
%%time
# Regress the discovery test statistics on the replication test statistics
# (OLS with an intercept) for every discovery/replication tissue pairing and
# collect the slope, its p-value, the adjusted R-squared and the t-value.
modeling_results = []
# the replication tissues do not depend on the discovery tissue: find them once
rep_tissues = scores_df.index[scores_df.index.str.startswith('Frontal')].unique()
# for each discovery and replication broad cell-type pairing
for disc_tissue in index_order:
    endo_term = disc_tissue
    if endo_term not in scores_df.index:
        # index_order lists every broad cell-type unconditionally, so a label
        # may have no results; skip it instead of failing on the lookup
        print(f'skipping {endo_term}, not present in the results')
        continue
    # get the features that were tested in discovery for cell-type
    disc_endo = disc_df.loc[(disc_df.tissue == endo_term)]
    # subset the test statistics for all those features
    metric_df = scores_df[disc_endo.feature]
    for rep_tissue in rep_tissues:
        exog_term = rep_tissue
        # one row per feature, one column per tissue; keep only the features
        # that have a value in both tissues
        model_in_df = metric_df.loc[[exog_term, endo_term]].T.dropna()
        if model_in_df.empty:
            print(f'skipping {endo_term} vs {exog_term}, no paired features')
            continue
        # compare the results via OLS
        X_exog = sm.add_constant(model_in_df[exog_term])
        model = sm.OLS(model_in_df[endo_term], X_exog)
        results = model.fit()
        modeling_results.append((endo_term, exog_term, results.params[exog_term], 
                                 results.pvalues[exog_term], results.rsquared_adj, 
                                 results.tvalues[exog_term]))

In [None]:
# tabulate the per-pairing regression summaries
result_columns = ['discovery', 'replication', 'coefficient', 'p-value', 'adj_r2', 'score']
modeling_results_df = DataFrame(modeling_results, columns=result_columns)
print(f'shape of modeling results data frame is {modeling_results_df.shape}')
if DEBUG:
    print(f'modeling results has {len(modeling_results)} items')
    display(modeling_results[:5])
    display(modeling_results_df.head())
    # top 20 pairings under each ranking, largest values first
    for rank_by in ('coefficient', 'adj_r2', 'score'):
        display(modeling_results_df.sort_values(rank_by, ascending=False).head(20))

#### visualize top result by adjusted R-squared

In [None]:
# pick the pairing with the largest adjusted R-squared
top_r2 = modeling_results_df.loc[modeling_results_df.adj_r2 == modeling_results_df.adj_r2.max()]
endo_term = top_r2.discovery.values[0]
exog_term = top_r2.replication.values[0]
print(endo_term, exog_term)
display(top_r2)
# get the features that were tested in discovery for cell-type
disc_endo = disc_df.loc[(disc_df.tissue == endo_term)]
# use the same metric table the models were fitted on (test statistics), so
# the plot shows the regression that produced the reported adjusted R-squared
metric_df = scores_df[disc_endo.feature]
# one row per feature, one column per tissue, features present in both only
model_in_df = metric_df.loc[[exog_term, endo_term]].T.dropna()
# lmplot is figure-level and sizes itself through `height`/`aspect`; the
# 'figure.figsize' rc setting has no effect on it. Axes follow the fitted
# model: replication on x (exog), discovery on y (endo).
lmplot(data=model_in_df, x=exog_term, y=endo_term, height=9)

### visualize modeling results as heatmap

In [None]:
# adjusted R-squared of every pairing, discovery tissues as rows and
# replication tissues as columns
modeling_corr_df = modeling_results_df.pivot(columns='replication', index='discovery',
                                             values='adj_r2')
print(modeling_corr_df.shape)
if DEBUG:
    display(modeling_corr_df.head())
visualize_correlations(df=modeling_corr_df, ordered_list=index_order,
                       metric='OLS modeling Adj R2', corr_min=0.05)

In [None]:
from numpy import finfo, log10

# p-value of every pairing, discovery tissues as rows and replication tissues
# as columns
modeling_corr_df = modeling_results_df.pivot(index='discovery', columns='replication', values='p-value')
print(modeling_corr_df.shape)
# since p-value log scale
# a p-value can underflow to exactly 0, which would become an infinite
# -log10 and break the colour scale; floor at the smallest positive normal float
smallest_p = finfo(float).tiny
modeling_corr_df = -log10(modeling_corr_df.clip(lower=smallest_p)).round(0)
if DEBUG:
    display(modeling_corr_df.head())
visualize_correlations(modeling_corr_df, index_order, 'OLS modeling -log10(p-value)', corr_min=3)

In [None]:
# regression coefficient of every pairing, discovery tissues as rows and
# replication tissues as columns
modeling_corr_df = modeling_results_df.pivot(columns='replication', index='discovery',
                                             values='coefficient')
print(modeling_corr_df.shape)
if DEBUG:
    display(modeling_corr_df.head())
visualize_correlations(df=modeling_corr_df, ordered_list=index_order,
                       metric='OLS modeling coefficient', corr_min=0.2)

In [None]:
# t-value ('score') of every pairing, discovery tissues as rows and
# replication tissues as columns
modeling_corr_df = modeling_results_df.pivot(columns='replication', index='discovery',
                                             values='score')
print(modeling_corr_df.shape)
if DEBUG:
    display(modeling_corr_df.head())
visualize_correlations(df=modeling_corr_df, ordered_list=index_order,
                       metric='OLS modeling score', corr_min=2)

In [None]:
!date