## Notebook to post-process the latent factor analysis results

In [1]:
# shell out to `date` so the saved notebook records when this run started
!date

Tue Jun 11 11:40:15 EDT 2024


#### import libraries

In [20]:
from pandas import read_csv, concat, DataFrame
from pickle import load as pkl_load
from statsmodels.stats.multitest import multipletests
import numpy as np
from pickle import dump as pkl_dump

#### set notebook variables

In [3]:
# project identifier; appears in every input and output file name below
project = 'aging_phase2'

# switches and constants
DEBUG = True    # when True the cells display samples and summaries
ALPHA = 0.05    # threshold applied to the adjusted p-values
model_types = ['PCA', 'NMF', 'ICA']
modalities = ['GEX', 'ATAC']
# maps the `type` label of the age results to the prefix used in file names
categories = {'curated_type': 'broad', 'cluster_name': 'specific'}

# directory layout
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'

# files written by this notebook
metrics_file = f'{results_dir}/{project}.latent.metrics.csv'
loadings_pickle = f'{results_dir}/{project}.latent.loadings.pkl'
loadings_file = f'{results_dir}/{project}.latent.loadings.csv'
assoc_file = f'{results_dir}/{project}.latent.age_glm.csv'

#### functions

In [4]:
def compute_bh_fdr(df: DataFrame, alpha: float=0.05, p_col: str='p-value',
                   method: str='fdr_bh', verbose: bool=True) -> DataFrame:
    """Return a copy of `df` with multiple-testing adjusted p-values added.

    Args:
        df: results table with one test per row; it is copied, not modified.
        alpha: level passed to `multipletests` and used as the threshold
            for the count printed when `verbose` is set.
        p_col: name of the column in `df` that holds the raw p-values.
        method: correction method passed to `multipletests`; the adjusted
            p-values are stored in a column with this same name.
        verbose: when True, print the shape of the subset of rows whose
            adjusted p-value is at or below `alpha`.

    Returns:
        A copy of `df` with one extra column, named after `method`, holding
        the adjusted p-values.
    """
    ret_df = df.copy()
    test_adjust = multipletests(np.array(ret_df[p_col]), alpha=alpha,
                                method=method)
    # element 1 of the multipletests result holds the adjusted p-values
    ret_df[method] = test_adjust[1]
    if verbose:
        # use `<=` so the reported total agrees with the notebook's own
        # `fdr_bh <= ALPHA` filter when a value sits exactly on the threshold
        print(f'total significant after correction: {ret_df.loc[ret_df[method] <= alpha].shape}')
    return ret_df

#### load age associated feature results to determine cell-types that need to be loaded
get the age-associated GEX and ATAC features needed per cell type

In [5]:
%%time
age_results = []
for category, prefix in categories.items():
    for modality in modalities:
        print(modality)
        in_file = f'{results_dir}/{project}.{modality}.{prefix}.glm_tweedie_fdr_filtered.age.csv'
        this_df = read_csv(in_file)
        this_df['modality'] = modality
        this_df['type'] = category
        age_results.append(this_df)
age_results_df = concat(age_results)
print(f'shape of the age results is {age_results_df.shape}')
if DEBUG:
    display(age_results_df.sample(5))
    display(age_results_df.modality.value_counts())
    display(age_results_df.groupby('type').tissue.value_counts())

GEX
ATAC
GEX
ATAC
shape of the age results is (69705, 10)


Unnamed: 0,feature,intercept,coef,stderr,z,p-value,tissue,type,fdr_bh,modality
27002,chr3:77027520-77028358,-4.322498,0.022001,0.006784,3.243137,0.001182,Astro,curated_type,0.046824,ATAC
6872,chr1:150256985-150257351,-5.291551,0.023049,0.006089,3.785594,0.000153,OPC,curated_type,0.015382,ATAC
12072,chr10:107162792-107163653,2.67522,0.019292,0.005353,3.603946,0.000313,Micro-3,cluster_name,0.038672,ATAC
24520,chr3:56862115-56862979,-2.680176,0.037245,0.00877,4.246857,2.2e-05,Astro-1,cluster_name,0.008858,ATAC
22092,chr2:218068508-218069455,-4.432717,0.01779,0.005278,3.370856,0.000749,Micro,curated_type,0.037006,ATAC


ATAC    63433
GEX      6272
Name: modality, dtype: int64

type          tissue     
cluster_name  Micro-3        10780
              Astro-1         7084
              ExN-9           3447
              ExN-14          2881
              ExN-7           2140
              OPC-4           1201
              OD-0            1147
              ExN-2           1012
              OD-21            582
              ExN-6            530
              InN-5            351
              ExN-15           339
              InN-18           326
              ExN-8            301
              PeriVasc-20      230
              InN-13           186
              InN-23           116
              InN-12           109
              ExN-19            84
              InN-10            65
              ExN-25            21
              VLMC-22           21
curated_type  Micro          14278
              Astro          10507
              OPC             6807
              ExN             2109
              OD              1916
              InN            

CPU times: user 160 ms, sys: 29 ms, total: 189 ms
Wall time: 338 ms


### load the results

#### load the age ~ latent factor association results

In [6]:
# file names use the lower-cased model type; build that list once up front
mdl_names = [element.lower() for element in model_types]
age_glm_results = []
# one GLM result table exists per (cell type, model type) combination
for category, cell_types in age_results_df.groupby('type')['tissue'].unique().items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        for mdl_type in mdl_names:
            this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.{mdl_type}_age_glm.csv'
            # label each row with where it came from before stacking
            this_result = read_csv(this_file).assign(type=prefix,
                                                     cell_type=cell_type,
                                                     model_type=mdl_type)
            age_glm_results.append(this_result)
age_glm_df = concat(age_glm_results)
print(f'shape of all age GLM results is {age_glm_df.shape}')
if DEBUG:
    display(age_glm_df.sample(4))

shape of all age GLM results is (491, 8)


Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type
0,NMF_0,48.117227,6.58909,7.30256,2.823433e-13,specific,ExN-6,nmf
5,NMF_5,25.80828,11.691488,2.207442,0.02728321,specific,OD-21,nmf
1,PCA_1,-2.437423,1.685644,-1.445989,0.1481803,specific,ExN-2,pca
2,ICA_2,3.599362,6.822566,0.527567,0.5977998,broad,OD,ica


#### load the latent factors feature loadings

In [7]:
# NOTE(review): pickle loading can execute arbitrary code; only point this at
# loadings files produced by this project's own pipeline
feature_loadings = {}
# file names use the lower-cased model type; build that list once up front
mdl_names = [element.lower() for element in model_types]
for category, cell_types in age_results_df.groupby('type').tissue.unique().items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        for mdl_type in mdl_names:
            this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.{mdl_type}_loadings.pkl'
            with open(this_file, 'rb') as pkl_file:
                this_loading = pkl_load(pkl_file)
            for factor, loading in this_loading.items():
                key_name = f'{cell_type}:{factor}'
                # the key omits the broad/specific prefix, so fail loudly
                # rather than silently overwrite an earlier factor's loadings
                if key_name in feature_loadings:
                    raise ValueError(f'duplicate factor loading key {key_name} from {this_file}')
                feature_loadings[key_name] = loading.copy()
print(f'{len(feature_loadings)} factor feature loadings loaded')

491 factor feature loadings loaded


#### load the selected component size model accuracy metrics 

In [8]:
# the metrics files carry no header row, so the column names are supplied here
# NOTE(review): 'RSME' is presumably a transposition of RMSE; the name is kept
# because it is written to the combined metrics file
metric_cols = ['type', 'cell_type', 'model_type', 'n_comp', 'R2', 'RSME']
mdl_metrics = []
for category, cell_types in age_results_df.groupby('type')['tissue'].unique().items():
    prefix = categories.get(category)
    for cell_type in cell_types:
        this_file = f'{results_dir}/latents/{project}.{prefix}.{cell_type}.latent_metrics.csv'
        this_df = read_csv(this_file, header=None)
        this_df.columns = metric_cols
        mdl_metrics.append(this_df)
# older logging may have written the same row more than once; keep the first
metrics_df = concat(mdl_metrics).drop_duplicates(keep='first')
print(f'shape of model accuracy metrics {metrics_df.shape}')
if DEBUG:
    display(metrics_df.sample(4))
    metrics_by_type = metrics_df.groupby('type')
    # same per-type summary for each numeric metric column
    for metric in ['n_comp', 'R2', 'RSME']:
        print(metric)
        display(metrics_by_type[metric].describe())
    display(metrics_df.sort_values('n_comp', ascending=False).head())

shape of model accuracy metrics (90, 6)


Unnamed: 0,type,cell_type,model_type,n_comp,R2,RSME
2,broad,InN,ICA,6,0.763,0.1095
1,specific,ExN-7,NMF,5,0.6971,0.1252
1,broad,OPC,NMF,6,0.8353,0.0919
0,specific,ExN-19,PCA,4,0.7006,0.1214


n_comp


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
broad,24.0,5.791667,0.832971,4.0,5.0,6.0,6.0,8.0
specific,66.0,5.333333,1.304823,3.0,4.0,5.0,6.0,10.0


R2


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
broad,24.0,0.775454,0.065818,0.6624,0.7258,0.7906,0.832975,0.8461
specific,66.0,0.775282,0.072721,0.6089,0.718325,0.78875,0.82495,0.8966


RSME


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
broad,24.0,0.104221,0.016273,0.0854,0.0894,0.1021,0.116625,0.1306
specific,66.0,0.106024,0.01872,0.0743,0.08825,0.10505,0.121,0.1453


Unnamed: 0,type,cell_type,model_type,n_comp,R2,RSME
1,specific,InN-13,NMF,10,0.8396,0.0782
1,broad,VLMC,NMF,8,0.8451,0.0855
1,specific,OD-21,NMF,8,0.8025,0.0997
2,broad,VLMC,ICA,7,0.8461,0.0856
0,specific,ExN-9,PCA,7,0.807,0.1009


In [9]:
# show the model(s) with the largest selected component count; derive the
# maximum from the data instead of hard-coding the value seen in one run
metrics_df.loc[metrics_df.n_comp == metrics_df.n_comp.max()]

Unnamed: 0,type,cell_type,model_type,n_comp,R2,RSME
1,specific,InN-13,NMF,10,0.8396,0.0782


In [10]:
# NOTE(review): repeats the n_comp == 10 lookup of the preceding cell; this
# cell looks redundant and is a candidate for deletion
metrics_df.loc[metrics_df.n_comp == 10]

Unnamed: 0,type,cell_type,model_type,n_comp,R2,RSME
1,specific,InN-13,NMF,10,0.8396,0.0782


In [11]:
# show the model(s) with the smallest RSME; compare against the column
# minimum rather than a hand-copied rounded float, which is fragile under
# exact equality and goes stale when the metrics change
metrics_df.loc[metrics_df.RSME == metrics_df.RSME.min()]

Unnamed: 0,type,cell_type,model_type,n_comp,R2,RSME
0,specific,InN-10,PCA,6,0.8903,0.0743
2,specific,InN-10,ICA,6,0.8905,0.0743


### compute B&H FDR for the age ~ latent factor associations

In [12]:
# rows with a missing p-value are set to p = 1 so they stay in the set of
# tests passed to the correction instead of propagating NaN
age_glm_df['p-value'] = age_glm_df['p-value'].fillna(1)
# pass the notebook-level ALPHA explicitly instead of relying on the
# function default happening to match it
age_glm_df = compute_bh_fdr(age_glm_df, alpha=ALPHA)
print(age_glm_df.shape)
if DEBUG:
    display(age_glm_df.sort_values('fdr_bh').head())

total significant after correction: (196, 9)
(491, 9)


Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type,fdr_bh
0,PCA_0,5.688729,0.511953,11.111818,1.0989950000000001e-28,specific,ExN-6,pca,5.396067e-26
1,NMF_1,-91.01509,10.883793,-8.362442,6.143308000000001e-17,specific,ExN-6,nmf,1.508182e-14
1,PCA_1,8.483076,1.059537,8.006398,1.181173e-15,specific,ExN-8,pca,1.933186e-13
2,ICA_2,15.63135,1.971145,7.930085,2.189954e-15,specific,InN-12,ica,2.688169e-13
1,NMF_1,45.051303,6.074144,7.416898,1.198953e-13,specific,InN-12,nmf,1.177372e-11


In [13]:
# keep only the latent factors whose adjusted p-value passes the threshold
oi_age_glm_df = age_glm_df.loc[age_glm_df.fdr_bh <= ALPHA]
print(oi_age_glm_df.shape)
if DEBUG:
    # sample() raises when asked for more rows than exist, so cap the request
    # in case fewer than four factors are significant
    display(oi_age_glm_df.sample(min(4, len(oi_age_glm_df))))

(196, 9)


Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type,fdr_bh
5,NMF_5,65.254972,12.900881,5.05818,4.232775e-07,specific,InN-13,nmf,4e-06
7,NMF_7,-61.424545,22.229757,-2.763168,0.005724336,specific,InN-13,nmf,0.018863
5,NMF_5,21.946813,9.16416,2.394853,0.01662704,broad,Micro,nmf,0.044129
3,NMF_3,-68.173628,16.902966,-4.033235,5.50143e-05,specific,InN-12,nmf,0.000322


In [14]:
# significant factors per model type; counting the column directly avoids
# grouping it by itself, which only produced a duplicated two-level index
display(oi_age_glm_df.model_type.value_counts())
# significant factors per model type within each cell type
display(oi_age_glm_df.groupby(['type', 'cell_type']).model_type.value_counts())
# significant factors per cell type across all model types
display(oi_age_glm_df.cell_type.value_counts())

model_type  model_type
ica         ica           66
nmf         nmf           82
pca         pca           48
Name: model_type, dtype: int64

type      cell_type    model_type
broad     Astro        ica           2
                       nmf           2
                       pca           1
          ExN          nmf           3
                       ica           2
                                    ..
specific  PeriVasc-20  ica           2
                       pca           2
          VLMC-22      nmf           3
                       ica           2
                       pca           2
Name: model_type, Length: 90, dtype: int64

InN-13         13
InN             9
Micro-3         9
Micro           9
ExN-19          9
ExN-7           8
PeriVasc        8
InN-5           8
ExN-2           7
VLMC-22         7
ExN-25          7
InN-18          7
VLMC            7
InN-10          7
ExN-6           7
PeriVasc-20     7
ExN             6
ExN-8           6
ExN-15          6
InN-23          5
ExN-14          5
Astro           5
OPC-4           5
InN-12          5
OD-21           5
OD-0            5
OD              4
Astro-1         4
OPC             3
ExN-9           3
Name: cell_type, dtype: int64

In [15]:
# significant factors for cell type InN-13, most significant first
oi_age_glm_df[oi_age_glm_df['cell_type'].eq('InN-13')].sort_values(by='fdr_bh')

Unnamed: 0,feature,coef,stderr,z,p-value,type,cell_type,model_type,fdr_bh
1,PCA_1,-17.562405,2.547774,-6.893235,5.45377e-12,specific,InN-13,pca,1.681503e-10
0,PCA_0,-11.527705,1.96617,-5.863026,4.545058e-09,specific,InN-13,pca,6.563598e-08
5,NMF_5,65.254972,12.900881,5.05818,4.232775e-07,specific,InN-13,nmf,3.778714e-06
3,ICA_3,13.035231,3.372817,3.86479,0.0001111848,specific,InN-13,ica,0.0005999094
6,ICA_6,-12.491276,3.39604,-3.678189,0.0002348961,specific,InN-13,ica,0.00118901
2,NMF_2,44.519907,12.149404,3.66437,0.0002479484,specific,InN-13,nmf,0.001242272
9,NMF_9,-50.769918,14.395237,-3.526855,0.0004205264,specific,InN-13,nmf,0.00198537
8,NMF_8,233.061741,72.84751,3.19931,0.001377571,specific,InN-13,nmf,0.005368155
4,PCA_4,-34.921063,11.457876,-3.047778,0.002305401,specific,InN-13,pca,0.008510915
0,NMF_0,22.162147,7.572018,2.926848,0.00342416,specific,InN-13,nmf,0.01192385


### save the combined results for the different result types

#### write the combined age ~ latent association results

In [16]:
# write every association (not only the significant ones); the index is
# written as the first column, which is the to_csv default made explicit
age_glm_df.to_csv(assoc_file, index=True)

#### write the combined latent factor feature loadings

In [17]:
# keep the loadings dictionary as loaded in a single pickle
with open(loadings_pickle, mode='wb') as out_handle:
    pkl_dump(feature_loadings, out_handle)
# also save a tabular copy with one row per '<cell_type>:<factor>' key
loadings_df = DataFrame(feature_loadings).T
loadings_df.to_csv(loadings_file)

#### write the combined latent modeling metrics

In [18]:
# write the de-duplicated metrics; the index is written as the first column,
# which is the to_csv default made explicit
metrics_df.to_csv(metrics_file, index=True)

In [19]:
# shell out to `date` so the saved notebook records when this run finished
!date

Tue Jun 11 11:40:45 EDT 2024
