## Notebook to post-process the latent factor analysis results

In [None]:
# log the wall-clock time at the start of the run (shell `date`)
!date

#### import libraries

In [None]:
from pandas import read_csv, concat, DataFrame
from pickle import load as pkl_load
from statsmodels.stats.multitest import multipletests
import numpy as np
from pickle import dump as pkl_dump

#### set notebook variables

In [None]:
# parameters
# project label; it is the file-name prefix of every input and output below
project = 'aging_phase2'

# directories
# NOTE(review): absolute, site-specific path; change it when running elsewhere
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'

# out files
# combined age ~ latent factor association results
assoc_file = f'{results_dir}/{project}.latent.age_glm.csv'
# combined latent factor feature loadings written as a CSV table
loadings_file = f'{results_dir}/{project}.latent.loadings.csv'
# the same combined loadings written as a pickled dictionary
loadings_pickle = f'{results_dir}/{project}.latent.loadings.pkl'
# combined latent model accuracy metrics
metrics_file = f'{results_dir}/{project}.latent.metrics.csv'

# variables and constants
# key: value stored in the 'type' column of the age results;
# value: prefix used when building the result file names
# (presumably the keys are cell annotation column names; confirm upstream)
categories = {'curated_type': 'broad', 'cluster_name': 'specific'}
# modalities whose age association results are read
modalities = ['GEX', 'ATAC']
# latent factor model types; lower-cased when building per-model file names
model_types = ['PCA', 'NMF', 'ICA']
# when True, cells display extra samples and summaries of the loaded tables
DEBUG = True
# adjusted p-value threshold used to select the associations of interest
ALPHA = 0.05

#### functions

In [None]:
def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p-value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Add multiple-testing adjusted p-values to a copy of a results table.

    Args:
        df: table with one row per test; it is not modified.
        alpha: significance threshold passed to `multipletests` and used for
            the reported count of significant rows.
        p_col: name of the column holding the unadjusted p-values.
        method: correction method passed to `multipletests`; it is also the
            name of the new column that stores the adjusted p-values.
        verbose: when True, print the shape of the subset of rows that are
            significant after correction.

    Returns:
        A copy of `df` with the adjusted p-values in a column named `method`.
    """
    ret_df = df.copy()
    test_adjust = multipletests(np.array(ret_df[p_col]), alpha=alpha,
                                method=method)
    # element 1 of the returned tuple holds the adjusted p-values
    ret_df[method] = test_adjust[1]
    if verbose:
        # inclusive comparison so the reported count agrees with the
        # `<= ALPHA` rule the notebook uses to select significant rows
        significant = ret_df.loc[ret_df[method] <= alpha]
        print(f'total significant after correction: {significant.shape}')
    return ret_df

#### load age associated feature results to determine cell-types that need to be loaded
get the age-associated GEX and ATAC features needed per cell-type

In [None]:
%%time
# gather the age-associated feature tables for every annotation level and
# modality, tagging each table with where it came from before stacking
age_results = []
for category, prefix in categories.items():
    for modality in modalities:
        print(modality)
        in_file = f'{results_dir}/{project}.{modality}.{prefix}.glm_tweedie_fdr_filtered.age.csv'
        tagged_df = read_csv(in_file).assign(modality=modality, type=category)
        age_results.append(tagged_df)
age_results_df = concat(age_results)
print(f'shape of the age results is {age_results_df.shape}')
if DEBUG:
    # random preview, then row counts per modality and per tissue within type
    display(age_results_df.sample(5))
    display(age_results_df['modality'].value_counts())
    display(age_results_df.groupby('type')['tissue'].value_counts())

### load the results

#### load the age ~ latent factor association results

In [None]:
# collect the age ~ latent factor GLM results for every annotation level,
# cell type and latent model type found in the age results
age_glm_results = []
lowercase_models = [element.lower() for element in model_types]
tissues_by_type = age_results_df.groupby('type')['tissue'].unique()
for category, cell_types in tissues_by_type.items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        for mdl_type in lowercase_models:
            this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.{mdl_type}_age_glm.csv'
            # label each row with its source before stacking
            age_glm_results.append(
                read_csv(this_file).assign(type=prefix, cell_type=cell_type,
                                           model_type=mdl_type)
            )
age_glm_df = concat(age_glm_results)
print(f'shape of all age GLM results is {age_glm_df.shape}')
if DEBUG:
    display(age_glm_df.sample(4))

#### load the latent factors feature loadings

In [None]:
# gather every latent factor's feature loadings into one dictionary keyed by
# '<cell_type>:<factor>'
# NOTE(review): unpickling can execute arbitrary code; only load loadings
# files that were produced by this project's own pipeline
feature_loadings = {}
lowercase_models = [element.lower() for element in model_types]
for category, cell_types in age_results_df.groupby('type').tissue.unique().items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        for mdl_type in lowercase_models:
            this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.{mdl_type}_loadings.pkl'
            with open(this_file, 'rb') as pkl_file:
                this_loading = pkl_load(pkl_file)
            for factor, loading in this_loading.items():
                key_name = f'{cell_type}:{factor}'
                # the key carries neither the model type nor the annotation
                # prefix, so report a repeated key instead of silently
                # replacing the earlier loading
                if key_name in feature_loadings:
                    print(f'WARNING: duplicate loading key {key_name} from {this_file}; '
                          'overwriting the earlier entry')
                feature_loadings[key_name] = loading.copy()
print(f'{len(feature_loadings)} factor feature loadings loaded')

#### load the selected component size model accuracy metrics 

In [None]:
# read the per cell type model accuracy metrics; the files have no header
# row, so the column names are supplied here
metric_columns = ['type', 'cell_type', 'model_type', 'n_comp', 'R2', 'RSME']
mdl_metrics = []
tissues_by_type = age_results_df.groupby('type')['tissue'].unique()
for category, cell_types in tissues_by_type.items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.latent_metrics.csv'
        this_df = read_csv(this_file, header=None)
        this_df.columns = metric_columns
        mdl_metrics.append(this_df)
# old logging may have some duplicate writing, so keep the first copy of each row
metrics_df = concat(mdl_metrics).drop_duplicates(keep='first')
print(f'shape of model accuracy metrics {metrics_df.shape}')
if DEBUG:
    display(metrics_df.sample(4))
    # summary statistics of each metric within each annotation level
    for metric in ('n_comp', 'R2', 'RSME'):
        print(metric)
        display(metrics_df.groupby('type')[metric].describe())
    # rows with the largest number of components
    display(metrics_df.sort_values('n_comp', ascending=False).head())

In [None]:
# rows of the metrics table whose n_comp equals 10
metrics_df.loc[metrics_df['n_comp'].eq(10)]

In [None]:
# rows of the metrics table whose n_comp equals 10
# NOTE(review): this is the same query as the preceding cell
metrics_df.loc[metrics_df['n_comp'].eq(10)]

In [None]:
# rows of the metrics table whose RSME is exactly 0.0743
# NOTE(review): exact float equality; a value that only rounds to 0.0743
# will not be matched
metrics_df.loc[metrics_df['RSME'].eq(0.0743)]

### compute B&H FDR for the age ~ latent factor associations

In [None]:
# missing p-values are set to 1 before the correction is computed
age_glm_df['p-value'] = age_glm_df['p-value'].fillna(1)
# pass the notebook-level ALPHA so the correction and the later significance
# filter share a single threshold
age_glm_df = compute_bh_fdr(age_glm_df, alpha=ALPHA)
print(age_glm_df.shape)
if DEBUG:
    # smallest adjusted p-values, then both extremes of the z statistic
    display(age_glm_df.sort_values('fdr_bh').head())
    display(age_glm_df.sort_values('z').head())
    display(age_glm_df.sort_values('z').tail())

In [None]:
# row counts of all association results: per model type, per annotation
# level / cell type / model type, and per cell type
display(age_glm_df.groupby('model_type')['model_type'].value_counts())
display(age_glm_df.groupby(['type', 'cell_type'])['model_type'].value_counts())
display(age_glm_df['cell_type'].value_counts())
# then the per cell type counts separately for each latent model type
for model_type in age_glm_df['model_type'].unique():
    print(f'\n### {model_type} ###')
    is_this_model = age_glm_df['model_type'] == model_type
    display(age_glm_df.loc[is_this_model, 'cell_type'].value_counts())

In [None]:
# keep the associations whose adjusted p-value is at or below the threshold
oi_age_glm_df = age_glm_df.loc[age_glm_df.fdr_bh <= ALPHA]
print(oi_age_glm_df.shape)
if DEBUG:
    # fewer than four (or no) associations may pass the threshold; cap the
    # preview size so sample() cannot raise on a small table
    display(oi_age_glm_df.sample(min(4, len(oi_age_glm_df))))

In [None]:
# row counts of the significant associations: per model type, per annotation
# level / cell type / model type, and per cell type
display(oi_age_glm_df.groupby('model_type')['model_type'].value_counts())
display(oi_age_glm_df.groupby(['type', 'cell_type'])['model_type'].value_counts())
display(oi_age_glm_df['cell_type'].value_counts())
# then the per cell type counts separately for each latent model type
for model_type in oi_age_glm_df['model_type'].unique():
    print(f'\n### {model_type} ###')
    is_this_model = oi_age_glm_df['model_type'] == model_type
    display(oi_age_glm_df.loc[is_this_model, 'cell_type'].value_counts())

In [None]:
# significant associations for the 'InN-13' cell type, smallest adjusted
# p-value first
inn13_rows = oi_age_glm_df['cell_type'].eq('InN-13')
oi_age_glm_df.loc[inn13_rows].sort_values('fdr_bh')

### save the combined results for the different result types

#### write the combined age ~ latent association results

In [None]:
# write the combined association table, including its adjusted p-value column
age_glm_df.to_csv(path_or_buf=assoc_file)

#### write the combined latent factor feature loadings

In [None]:
# keep the combined loadings dictionary as a pickle
with open(loadings_pickle, 'wb') as out_handle:
    pkl_dump(feature_loadings, out_handle)
# and as a CSV table with one row per '<cell_type>:<factor>' key
loadings_df = DataFrame(feature_loadings).T
loadings_df.to_csv(path_or_buf=loadings_file)

#### write the combined latent modeling metrics

In [None]:
# write the de-duplicated model accuracy metrics table
metrics_df.to_csv(path_or_buf=metrics_file)

In [None]:
# log the wall-clock time at the end of the run (shell `date`)
!date