## Summarize the latent factor metrics and compare the latent factors across model types and cell types

In [None]:
# print the current date and time via the shell
!date

#### import libraries

In [None]:
from pandas import read_csv, concat, DataFrame
from seaborn import barplot, scatterplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import statsmodels.api as sm
from pandas import DataFrame
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from statsmodels.stats.multitest import multipletests

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# --- run parameters ---------------------------------------------------------
project = 'aging_phase2'

# --- directory layout -------------------------------------------------------
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'

# every input/output file shares the "<results_dir>/<project>" stem
_file_stem = f'{results_dir}/{project}'

# --- inputs: GLM age-association results and latent model summary metrics ---
assoc_file = f'{_file_stem}.latent.age_glm.csv'
metrics_file = f'{_file_stem}.latent.metrics.csv'

# --- output: pairwise latent factor regression results ----------------------
results_file = f'{_file_stem}.associated_latent_factors.csv'

# --- variables and constants ------------------------------------------------
categories = {'curated_type': 'broad', 'cluster_name': 'specific'}
modalities = ['GEX', 'ATAC']
# lower-cased versions of these names appear in the latent factor file names
model_types = ['PCA', 'NMF', 'ICA']
# when True, cells display sample rows of the tables they build
DEBUG = False
# threshold applied to the fdr_bh columns throughout the notebook
ALPHA = 0.05
# figure resolution passed to rc_context for every plot
DPI = 100

### load the input files

#### load the summary metrics for the latent factors

In [None]:
# read the latent model summary metrics; the first CSV column becomes the index
factor_metrics = read_csv(metrics_file, index_col=0)
print(f'shape of factor_metrics is {factor_metrics.shape}')
# when debugging, show five randomly sampled rows
if DEBUG:
    display(factor_metrics.sample(5))

#### load the latent factor GLM age association results

In [None]:
# read the GLM age association results; the first CSV column becomes the index
age_glm = read_csv(assoc_file, index_col=0)
print(f'shape of age_glm is {age_glm.shape}')
# when debugging, show four randomly sampled rows
if DEBUG:
    display(age_glm.sample(4))

### visualize the accuracy of the latent models

#### by number of components selected

In [None]:
# Bar plot of the number of components per cell type, one bar per model type,
# ordered from most to fewest components.
# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    barplot(data=factor_metrics.sort_values('n_comp', ascending=False),
            x='cell_type', y='n_comp', hue='model_type', palette='colorblind')
    plt.xticks(rotation=90)
    plt.title('Number of components selected for model types')
    plt.xlabel('Cell types')
    plt.ylabel('Number of components')
    # lay out last so the title and axis labels are taken into account
    plt.tight_layout()
    plt.show()

#### by R-squared

In [None]:
# Bar plot of R-squared per cell type, one bar per model type, ordered from
# highest to lowest R2.
# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    barplot(data=factor_metrics.sort_values('R2', ascending=False),
            x='cell_type', y='R2', hue='model_type', palette='colorblind')
    plt.xticks(rotation=90)
    plt.title('Accuracy of model types, R-squared')
    plt.xlabel('Cell types')
    # lay out last so the title and axis label are taken into account
    plt.tight_layout()
    plt.show()

#### by RMSE

In [None]:
# Bar plot of the error metric per cell type, one bar per model type, ordered
# from lowest to highest value.
# NOTE(review): the column is spelled 'RSME' while the title says RMSE;
# presumably this matches the header in the metrics file - confirm.
# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    barplot(data=factor_metrics.sort_values('RSME', ascending=True),
            x='cell_type', y='RSME', hue='model_type', palette='colorblind')
    plt.xticks(rotation=90)
    plt.title('Accuracy of model types, RMSE')
    plt.xlabel('Cell types')
    # lay out last so the title and axis label are taken into account
    plt.tight_layout()
    plt.show()

### visualize the latent factors associated with age

In [None]:
# Scatter of coefficient vs z for the rows of age_glm with fdr_bh <= ALPHA,
# coloured by model type.
# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    scatterplot(data=age_glm.loc[age_glm.fdr_bh <= ALPHA],
                x='coef', y='z', hue='model_type', palette='colorblind')
    # anchor the legend just outside the right edge of the axes
    plt.legend(bbox_to_anchor=(1.15, 1), loc='upper right', borderaxespad=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Scatter of coefficient vs z for the rows of age_glm with fdr_bh <= ALPHA,
# coloured by cell type with the marker style encoding the model type.
# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    scatterplot(data=age_glm.loc[age_glm.fdr_bh <= ALPHA],
                x='coef', y='z', hue='cell_type', palette='colorblind', style='model_type')
    # single-column legend in a small font, anchored outside the right edge
    plt.legend(bbox_to_anchor=(1.15, 1), loc='upper right', borderaxespad=0, ncol=1, fontsize=9)
    plt.tight_layout()
    plt.show()

In [None]:
# Count the age-associated rows (fdr_bh <= ALPHA) per cell type and model type.
# count() tallies the non-null entries of every remaining column; the 'feature'
# column is the one sorted on and plotted below.
age_factor_counts = (age_glm.loc[age_glm.fdr_bh <= ALPHA]
                     .groupby(['cell_type', 'model_type'])
                     .count().sort_values('feature', ascending=False))
if DEBUG:
    display(age_factor_counts)

# The style is entered first and the explicit size/DPI are layered on top of
# it: calling plt.style.use() inside rc_context applies the style sheet's rc
# values after the requested ones, so a figure.figsize defined by the style
# would replace the (15, 11) asked for here. Both contexts restore the global
# rcParams on exit.
with plt.style.context('seaborn-v0_8-talk'), \
        rc_context({'figure.figsize': (15, 11), 'figure.dpi': DPI}):
    # after the groupby, cell_type and model_type are index levels; hand the
    # plot real columns instead of relying on seaborn resolving index names
    barplot(data=age_factor_counts.reset_index(),
            x='cell_type', y='feature', hue='model_type', palette='colorblind')
    plt.xticks(rotation=90)
    plt.title('Number of components selected for model types that are age associated')
    plt.xlabel('Cell types')
    plt.ylabel('Number of components')
    # lay out last so the title and axis labels are taken into account
    plt.tight_layout()
    plt.show()

### identify which age-associated latent factors are well correlated across cell and model types

#### load, label, and combine the latent factors into a dataframe

In [None]:
# keep only the GLM rows whose BH-adjusted value is at or below ALPHA
is_age_assoc = age_glm.fdr_bh <= ALPHA
age_assoc_glm = age_glm.loc[is_age_assoc]
print(f'shape of age_assoc_glm {age_assoc_glm.shape}')
if DEBUG:
    display(age_assoc_glm.sample(5))
    # number of retained rows per (cell type, model type), largest first
    per_type_counts = age_assoc_glm.groupby('cell_type').model_type.value_counts()
    display(per_type_counts.sort_values(ascending=False))

In [None]:
# the component file names use the lower-cased model type; derive them once
mdl_suffixes = [name.lower() for name in model_types]
# cell types with at least one age-associated row, grouped by the 'type' column
cell_types_by_category = age_assoc_glm.groupby('type').cell_type.unique()

factors = []
for category, cell_types in cell_types_by_category.items():
    for cell_type in cell_types:
        for mdl_type in mdl_suffixes:
            this_file = f'{results_dir}/latents/{project}.{category}.{cell_type}.{mdl_type}_components.csv'
            # label every factor column with the cell type it came from
            # NOTE(review): the prefix carries the cell type only, so column
            # names are unique only if the file's own column names differ
            # between model types and categories - confirm
            factors.append(read_csv(this_file, index_col=0)
                           .add_prefix(f'{cell_type}:'))

# join the per-file tables side by side into one wide table
factors_df = concat(factors, axis='columns')
# min-max scale each column for interpretability, keeping the labels
scaled_values = MinMaxScaler().fit_transform(factors_df)
factors_df = DataFrame(data=scaled_values,
                       columns=factors_df.columns, index=factors_df.index)
print(f'shape of factors_df is {factors_df.shape}')
if DEBUG:
    display(factors_df.sample(5))

#### create list of pairings to run regressions for

In [None]:
# every unordered pair of distinct factor columns: n * (n - 1) / 2 pairs for n columns
pairings = list(combinations(factors_df.columns, 2))

In [None]:
# number of regressions that will be run
len(pairings)

#### regress the pairings

In [None]:
%%time
def regress_pair(endog_name: str, exog_name: str, data: DataFrame) -> 'list | None':
    """Regress one column of ``data`` on another with a statsmodels GLM.

    The model is ``sm.GLM(endog, exog)`` with statsmodels' default family,
    where ``exog`` is the ``exog_name`` column passed through
    ``sm.add_constant``.

    Args:
        endog_name: column of ``data`` used as the dependent variable.
        exog_name: column of ``data`` used as the independent variable.
        data: table holding both columns; it is passed to the model as is.

    Returns:
        ``None`` when both names are equal. Otherwise a six item list
        ``[endog_name, exog_name, coef, stderr, statistic, p-value]`` taken
        from position 1 of the fitted parameters. When fitting or reading
        the results fails, the four numeric items are ``nan``.
    """
    # a factor regressed on itself carries no information
    if endog_name == exog_name:
        return None

    endog = data[endog_name].values
    exog = sm.add_constant(data[exog_name].values)
    try:
        result = sm.GLM(endog, exog).fit()
        # the indexing stays inside the try: position 1 may not exist if no
        # separate constant column ended up in exog
        return [endog_name, exog_name,
                result.params[1], result.bse[1],
                result.tvalues[1], result.pvalues[1]]
    except Exception as err:
        # catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop a long run; report what actually went wrong
        print(f'Caught Error for {endog_name} ~ {exog_name}: {err!r}')
        return [endog_name, exog_name] + [np.nan] * 4

# run one regression per pairing; each model only sees the rows where both
# factors are present (rows with a missing value in either column are dropped)
results = [regress_pair(endog_name, exog_name, 
                        factors_df[[endog_name, exog_name]].dropna()) 
           for endog_name, exog_name in pairings]

#### convert regression results into a dataframe

In [None]:
# one row per pairing, in the item order produced by regress_pair
result_columns = ['endog', 'exog', 'coef', 'stderr', 'z', 'p-value']
results_df = DataFrame(data=results, columns=result_columns)
print(f'shape of results_df is {results_df.shape}')
if DEBUG:
    display(results_df.sample(5))

#### compute the FDR values

In [None]:
def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p-value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Return a copy of ``df`` with multiple-testing adjusted p-values added.

    Args:
        df: table holding the raw p-values; it is not modified.
        alpha: level passed to ``multipletests`` and used for the summary line.
        p_col: name of the column holding the raw p-values.
        method: correction method passed to ``multipletests``; also the name
            of the column the adjusted values are stored in.
        verbose: when true, print the shape of the rows whose adjusted value
            is strictly below ``alpha``.

    Returns:
        A copy of ``df`` with one extra column named after ``method``.
    """
    adjusted_df = df.copy()
    raw_p = np.array(adjusted_df[p_col])
    # the second item returned by multipletests holds the corrected p-values
    _, corrected_p, *_ = multipletests(raw_p, alpha=alpha, method=method)
    adjusted_df[method] = corrected_p
    if verbose:
        # note: strict '<' here, and the full (rows, columns) shape is printed
        hits_shape = adjusted_df.loc[adjusted_df[method] < alpha].shape
        print(f'total significant after correction: {hits_shape}')
    return adjusted_df

In [None]:
# regressions that raised were recorded with NaN values; give them a p-value
# of 1 so they stay in the correction as non-significant tests
results_df['p-value'] = results_df['p-value'].fillna(1)
# adds the 'fdr_bh' column (default method of compute_bh_fdr)
results_df = compute_bh_fdr(results_df)
print(f'shape of results_df is {results_df.shape}')
# when debugging, show the rows with the smallest adjusted values
if DEBUG:
    display(results_df.sort_values(['fdr_bh']).head())

#### save the results

In [None]:
# write all pairings (significant or not) to the results file, index included
results_df.to_csv(results_file)

#### visualize a random result

In [None]:
# Pick one significant pairing at random and plot the two factors against
# each other.
significant_results = results_df.loc[results_df.fdr_bh <= ALPHA]
if significant_results.empty:
    # sample(n=1) raises on an empty selection; report instead of failing
    print('no pairings with fdr_bh <= ALPHA, nothing to plot')
else:
    random_result = significant_results.sample(n=1).iloc[0]
    print(random_result)

    # The style is entered first and the explicit size/DPI are layered on top
    # of it: calling plt.style.use() inside rc_context applies the style
    # sheet's rc values after the requested ones, so a figure.figsize defined
    # by the style would replace the (9, 9) asked for here.
    with plt.style.context('seaborn-v0_8-talk'), \
            rc_context({'figure.figsize': (9, 9), 'figure.dpi': DPI}):
        scatterplot(data=factors_df, x=random_result.exog, y=random_result.endog)

#### which ones are shared across different cell-types

In [None]:
# significant pairings whose dependent factor is labelled 'ExN...' while the
# independent factor is not, ordered by z from lowest to highest
# NOTE(review): only this direction is kept; a pairing stored with the ExN
# factor on the exog side is not selected - confirm that is intended
is_significant = results_df.fdr_bh <= ALPHA
endog_is_exn = results_df.endog.str.startswith('ExN')
exog_is_exn = results_df.exog.str.startswith('ExN')
diff_celltypes = (results_df
                  .loc[is_significant & endog_is_exn & ~exog_is_exn]
                  .sort_values('z'))
if DEBUG:
    display(diff_celltypes.head())

In [None]:
# Pick one cross-cell-type pairing at random and plot the two factors against
# each other. diff_celltypes is already limited to fdr_bh <= ALPHA where it is
# built, so no further filtering is needed here.
if diff_celltypes.empty:
    # sample(n=1) raises on an empty selection; report instead of failing
    print('no significant pairings across cell types, nothing to plot')
else:
    random_result = diff_celltypes.sample(n=1).iloc[0]
    print(random_result)

    # The style is entered first and the explicit size/DPI are layered on top
    # of it: calling plt.style.use() inside rc_context applies the style
    # sheet's rc values after the requested ones, so a figure.figsize defined
    # by the style would replace the (9, 9) asked for here.
    with plt.style.context('seaborn-v0_8-talk'), \
            rc_context({'figure.figsize': (9, 9), 'figure.dpi': DPI}):
        scatterplot(data=factors_df, x=random_result.exog, y=random_result.endog)

In [None]:
# print the current date and time via the shell
!date