## Notebook to compare both glmmTMB and pseudobulk GLM results from the different data preps of the NABEC frontal cortex data; i.e., scVI normalized and min-max scaled versus unscaled

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame, concat
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from seaborn import lmplot
from matplotlib_venn import venn2

#### set notebook variables

In [None]:
# naming: the project label and the result-set prefix derived from it
project = 'aging_phase1'
set_name = project + '_replication'

# locations: working directory and the replication results beneath it
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
replication_dir = wrk_dir + '/replication'

# input file templates; filled in later with str.format(this_dir=..., name=...)
glmpb_result_file_frmt = '{this_dir}/{name}.glm_pb_age_diffs.csv'
glmmtmb_result_file_frmt = '{this_dir}/{name}.glmmtmb_age_diffs.csv'

# output files: none are written by this notebook section

# switches: set DEBUG to True to print/display intermediate tables
DEBUG = False

#### functions

In [None]:
def load_results(path: str, name: str) -> DataFrame:
    """Read one results CSV and label every row with its result-set name.

    Args:
        path: location of the CSV file to read.
        name: label written to the ``result_set`` column of each row.

    Returns:
        The loaded table with glmmTMB-style headers renamed to ``coef``,
        ``stderr``, ``z`` and ``p-value``, spaces in ``tissue`` replaced by
        underscores, and a ``result_set`` column holding ``name``.
    """
    if DEBUG:
        print(name, path)
    # glmmTMB headers -> the headers used by the pseudobulk GLM results;
    # headers that are absent from the file are left untouched by rename
    header_map = {
        'estimate': 'coef',
        'std.error': 'stderr',
        'statistic': 'z',
        'p.value': 'p-value',
    }
    table = read_csv(path).rename(columns=header_map)
    # tissue labels must not contain spaces
    table['tissue'] = table.tissue.str.replace(' ', '_')
    table['result_set'] = name
    return table

def compare_results(df: DataFrame, endog_method: str,
                    exog_method: str, metric: str):
    """Regress one result set's metric on another's and plot the relationship.

    The rows of ``df`` belonging to the two result sets are pivoted so that
    each (feature, tissue) pair is a row and each result set is a column
    holding ``metric``. Pairs missing from either result set are counted,
    reported and dropped; an OLS model ``endog_method ~ exog_method`` is then
    fitted, its summary printed, and a scatter plot with a regression line
    shown.

    Args:
        df: combined results with ``result_set``, ``feature``, ``tissue``
            and ``metric`` columns.
        endog_method: result-set name used as the dependent variable (y).
        exog_method: result-set name used as the independent variable (x).
        metric: name of the column to compare, e.g. ``'coef'``.

    Returns:
        None. Output is printed and plotted.
    """
    methods_df = df.loc[df.result_set.isin([endog_method, exog_method])]
    if DEBUG:
        print(methods_df.shape)
        display(methods_df.result_set.value_counts())
    # one row per (feature, tissue) pair, one column per result set
    this_piv = methods_df.pivot_table(values=metric, index=['feature', 'tissue'],
                                      columns='result_set')
    if DEBUG:
        # cap the sample size so small tables do not raise
        display(this_piv.sample(min(5, len(this_piv))))
    # report every pair that dropna() is about to discard, i.e. pairs missing
    # from either result set, not only those missing from the endog one
    has_missing = this_piv[[endog_method, exog_method]].isna().any(axis=1)
    print(f'missings found {this_piv.loc[has_missing].shape}')
    this_piv = this_piv.dropna()
    model_results = ols(formula=f'{endog_method} ~ {exog_method}',
                        data=this_piv).fit()
    print(model_results.summary())

    # NOTE(review): lmplot is a figure-level seaborn function that sizes its
    # figure through its own height/aspect arguments, so the figsize set here
    # may not take effect - confirm
    with rc_context({'figure.figsize': (9, 9)}):
        plt.style.use('seaborn-v0_8-talk')
        # keep the axes consistent with the fitted model: exog on x, endog on y
        lmplot(data=this_piv, x=exog_method, y=endog_method)
        plt.title(metric)
        plt.show()


def check_results_intersection(df: DataFrame, endog_method: str,
                               exog_method: str, max_p: float=0.05):
    """Print and draw the overlap of significant results between two result sets.

    A result is significant when its ``fdr_bh`` value is at most ``max_p``.
    Results are identified by their ``feature:tissue`` pair. The size of the
    overlap is printed and a two-set Venn diagram is shown.

    Args:
        df: combined results with ``result_set``, ``fdr_bh``, ``feature``
            and ``tissue`` columns.
        endog_method: name of the first result set (left circle).
        exog_method: name of the second result set (right circle).
        max_p: inclusive upper bound on ``fdr_bh`` for a result to count.
    """
    def significant_pairs(method: str) -> set:
        # 'feature:tissue' keys of the rows of one result set passing the cutoff
        hits = df.loc[(df.result_set == method) & (df.fdr_bh <= max_p)]
        return set(hits.feature + ':' + hits.tissue)

    endog_pairs = significant_pairs(endog_method)
    exog_pairs = significant_pairs(exog_method)
    shared_pairs = endog_pairs & exog_pairs
    print(len(shared_pairs))

    # venn2 takes the sizes as (only first, only second, in both)
    region_sizes = (len(endog_pairs - exog_pairs),
                    len(exog_pairs - endog_pairs),
                    len(shared_pairs))
    venn2(subsets=region_sizes, set_labels=(endog_method, exog_method))
    plt.show()

### load results

#### load the results based on unscaled data

In [None]:
# both unscaled result files are read from the full_h5ad_results subdirectory
temp1_df = load_results(
    path=glmmtmb_result_file_frmt.format(
        this_dir=f'{replication_dir}/full_h5ad_results', name=set_name),
    name='glmmtmb_unscaled',
)
temp2_df = load_results(
    path=glmpb_result_file_frmt.format(
        this_dir=f'{replication_dir}/full_h5ad_results', name=set_name),
    name='glmpb_unscaled',
)
# start the combined table with the two unscaled result sets
results_df = concat(objs=[temp1_df, temp2_df])

#### load the results based on the scVI normalized minmax scaled data

In [None]:
# the scaled result files are read directly from the replication directory
temp1_df = load_results(
    path=glmmtmb_result_file_frmt.format(this_dir=replication_dir, name=set_name),
    name='glmmtmb_scaled',
)
temp2_df = load_results(
    path=glmpb_result_file_frmt.format(this_dir=replication_dir, name=set_name),
    name='glmpb_scaled',
)
# append the two scaled result sets to the table built so far
results_df = concat(objs=[results_df, temp1_df, temp2_df])

In [None]:
# optional look at the combined table: size, rows per result set, a few rows
if DEBUG:
    print(results_df.shape)
    display(results_df['result_set'].value_counts())
    display(results_df.sample(n=4))

### compare result sets

#### compare scaled to unscaled for glmmTMB results

In [None]:
# glmmTMB coefficients: unscaled data against scaled data
compare_results(df=results_df, endog_method='glmmtmb_unscaled',
                exog_method='glmmtmb_scaled', metric='coef')

#### compare glmmTMB to pseudobulk GLM for unscaled data

In [None]:
# unscaled data coefficients: glmmTMB against pseudobulk GLM
compare_results(df=results_df, endog_method='glmmtmb_unscaled',
                exog_method='glmpb_unscaled', metric='coef')

#### compare glmmTMB to pseudobulk GLM for scaled data

In [None]:
# scaled data coefficients: glmmTMB against pseudobulk GLM
compare_results(df=results_df, endog_method='glmmtmb_scaled',
                exog_method='glmpb_scaled', metric='coef')

#### compare scaled to unscaled for pseudobulk GLM results

In [None]:
# pseudobulk GLM coefficients: unscaled data against scaled data
compare_results(df=results_df, endog_method='glmpb_unscaled',
                exog_method='glmpb_scaled', metric='coef')

### check intersection of statistically significant results

In [None]:
# significant pseudobulk GLM results shared between unscaled and scaled data
check_results_intersection(df=results_df, endog_method='glmpb_unscaled',
                           exog_method='glmpb_scaled')

In [None]:
# significant glmmTMB results shared between unscaled and scaled data
check_results_intersection(df=results_df, endog_method='glmmtmb_unscaled',
                           exog_method='glmmtmb_scaled')

In [None]:
# significant unscaled-data results shared between glmmTMB and pseudobulk GLM
check_results_intersection(df=results_df, endog_method='glmmtmb_unscaled',
                           exog_method='glmpb_unscaled')

In [None]:
# significant scaled-data results shared between glmmTMB and pseudobulk GLM
check_results_intersection(df=results_df, endog_method='glmmtmb_scaled',
                           exog_method='glmpb_scaled')

In [None]:
!date