## Perform enrichment for TF activity and gene expression in d2.5 sisters with defined fate

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import kneighbors_graph
import os

import sf_utils

In [2]:
# Load the master clone table (first CSV column becomes the index) and keep
# an independent copy of the rows whose 'day' value is 'd2'.
clone_table = pd.read_csv(
    "../../hsc.rna&atac.r1&2_master_v2.csv",
    index_col=0,
)
clone_table_d2 = clone_table.loc[clone_table['day'].eq('d2')].copy()

In [3]:
# Import the TF activity matrix; the first CSV column is used as the index.
atac_cv = pd.read_csv(
    "../../lsk_atac_reanalysis/atac_analysis/output_files/lsk_subset2/lskatac_tfact.csv",
    index_col=0,
)

In [4]:
# Show which columns of the TF activity matrix contain at least one NaN value
atac_cv.columns[atac_cv.isna().any()]

Index(['Crebzf_122', 'Pou3f2_555'], dtype='object')

In [5]:
# Remove, in place, every column of the TF activity matrix that has any NaN
atac_cv.drop(columns=atac_cv.columns[atac_cv.isna().any()], inplace=True)

In [6]:
# List the distinct fate labels present in the d2 subset of the clone table
clone_table_d2.fate.unique()

array(['no_fate_cells', 'Unk_3', 'Unk_2', 'Mono', 'Neutro', 'MPP/GMP',
       'MPP', 'uns', 'MEP', 'Unk_1', 'Ery/Meg', 'Ccr7_DC',
       'Baso/Eos/Mast', 'pDC', 'Lym'], dtype=object)

In [7]:
# Relabel both dendritic-cell fates ('pDC' and 'Ccr7_DC') as one combined fate
dc_rows = clone_table_d2['fate'].isin(['pDC', 'Ccr7_DC'])
clone_table_d2.loc[dc_rows, 'fate'] = 'pDC/Ccr7DC'

In [8]:
# Per-fate TF activity enrichment: for every fate, compare the fate's d2 cells
# against 500 randomly drawn rows of the TF activity matrix and store the
# resulting table in tf_enrichment_table[fate].
tf_enrichment_table = {}
# Number of neighbours passed to kneighbors_graph for each fate; 0 disables the
# k-nn step and the fate's own cell barcodes are used unchanged.
nn_impute_dict = {'Mono':0,
                  'Neutro':0,
                  'Ery/Meg':0,
                  'Baso/Eos/Mast':0,
                  'pDC/Ccr7DC':10,
                  'Lym':10}

# Seeded random state (seed value is arbitrary): every fate still gets its own
# independent 500-row draw, but the draws are now identical from run to run,
# so the tables written to disk can be regenerated exactly.
background_rng = np.random.RandomState(0)

# The LSI embedding does not depend on the fate, so it is read from disk at
# most once (on first use) instead of once per imputed fate.
atac_emb = None

for fate in ['Mono', 'Neutro','Ery/Meg', 'Lym', 'pDC/Ccr7DC','Baso/Eos/Mast']:
    print(fate)
    clone_curr = clone_table_d2[clone_table_d2.fate == fate].copy()
    n_neighbors = nn_impute_dict[fate]

    if n_neighbors > 0:
        print("performing knn imputation")

        if atac_emb is None:
            atac_emb = pd.read_csv("../../lsk_atac_reanalysis/atac_analysis/output_files/lsk_subset2/lskatac_lsi.csv", index_col=0)
            atac_cells = atac_emb.index

        #create k-nn graph
        knn_graph = kneighbors_graph(atac_emb, n_neighbors)

        # perform k-nn imputation
        # NOTE(review): merge_nn presumably extends the fate's barcodes with
        # their graph neighbours -- confirm against sf_utils.
        cells_final = sf_utils.merge_nn(knn_graph, atac_cells, clone_curr['cell.bc'])

    else:
        cells_final = clone_curr['cell.bc'].copy()

    # Row labels of the randomly drawn comparison set, reported as "others"
    background_cells = atac_cv.sample(n=500, random_state=background_rng).index

    tf_list = sf_utils.enrich_fn(cells_final, background_cells, atac_cv, col1_id=fate, col2_id="others", return_sig=False)
    # Row labels look like '<TF>_<number>'; keep the part before the first '_'
    tf_list['tf_name'] = tf_list.index.map(lambda x:x.split("_")[0])
    tf_enrichment_table[fate] = tf_list
    

Mono


  result = getattr(ufunc, method)(*inputs, **kwargs)


Neutro


  result = getattr(ufunc, method)(*inputs, **kwargs)


Ery/Meg


  result = getattr(ufunc, method)(*inputs, **kwargs)


Lym
performing knn imputation


  result = getattr(ufunc, method)(*inputs, **kwargs)


pDC/Ccr7DC
performing knn imputation


  result = getattr(ufunc, method)(*inputs, **kwargs)


Baso/Eos/Mast


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
# Turn each fate key into a filename-safe name: every '_' becomes '-' first,
# then every '/' becomes '_' (e.g. 'Ery/Meg' -> 'Ery_Meg').
name_list = [
    fate_key.replace("_", "-").replace("/", "_")
    for fate_key in tf_enrichment_table.keys()
]

In [10]:
# Display the current contents of name_list
name_list

['Mono', 'Neutro', 'Ery_Meg', 'Lym', 'pDC_Ccr7DC', 'Baso_Eos_Mast']

In [14]:
# Show the Mono TF enrichment table ordered by 'delta', largest values first
tf_enrichment_table['Mono'].sort_values('delta', ascending=False)

Unnamed: 0,Mono,others,delta,log2fc,A,p-val,p-adj,p-adj-log,is_significant,tf_name
Atf5_791,0.736954,-0.154195,0.891149,,0.368913,6.601941e-09,1.690645e-06,5.771948,True,Atf5
Cebpg_129,0.724964,-0.147482,0.872445,,0.365962,2.826022e-07,3.040692e-05,4.517028,True,Cebpg
Cebpd_97,0.843509,0.012946,0.830564,5.821120,0.514226,1.150098e-08,1.690645e-06,5.771948,True,Cebpd
Cebpb_130,0.843509,0.012946,0.830564,5.821120,0.514226,1.150098e-08,1.690645e-06,5.771948,True,Cebpb
Cebpa_115,0.559357,-0.123496,0.682854,,0.284432,1.963168e-09,9.643910e-07,6.015747,True,Cebpa
...,...,...,...,...,...,...,...,...,...,...
Mesp1_58,0.367368,0.718631,-0.351263,-0.966236,0.625738,2.237568e-04,8.223064e-03,2.084966,True,Mesp1
Mesp2_57,0.367368,0.718631,-0.351263,-0.966236,0.625738,2.237568e-04,8.223064e-03,2.084966,True,Mesp2
Tcf3_31,0.326462,0.687783,-0.361321,-1.072705,0.591797,4.335077e-04,1.318461e-02,1.879933,True,Tcf3
Tcf12_59,0.360568,0.749239,-0.388671,-1.053094,0.636825,1.089247e-04,4.803577e-03,2.318435,True,Tcf12


In [55]:
# Write one CSV of TF activity enrichment results per fate.
tf_out_dir = "220208/tf_activity"
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate exists-then-create check.
os.makedirs(tf_out_dir, exist_ok=True)

# name_list is expected to hold one filename-safe name per dict entry, in the
# same order as the dict itself.
for name_curr, tf_table in zip(name_list, tf_enrichment_table.values()):
    tf_table.to_csv(f"{tf_out_dir}/{name_curr}_state_tfact_markers.csv")

In [57]:
# Show the TF enrichment table stored for the combined 'pDC/Ccr7DC' fate
tf_enrichment_table['pDC/Ccr7DC']

Unnamed: 0,pDC/Ccr7DC,others,delta,log2fc,A,p-val,p-adj,p-adj-log,is_significant,tf_name
Sfpi1_265,0.191961,-1.042593,1.234554,,-0.799158,1.724948e-06,0.000304,3.516725,True,Sfpi1
Bcl11a_795,-0.061588,-1.063597,1.002009,-4.239233,-1.192949,9.784884e-07,0.000250,3.602077,True,Bcl11a
Bcl11b_814,-0.061588,-1.063597,1.002009,-4.239233,-1.192949,9.784884e-07,0.000250,3.602077,True,Bcl11b
Spic_269,0.331737,-0.468366,0.800103,,-0.102084,7.156825e-06,0.001052,2.977962,True,Spic
Irf1_631,0.869811,0.073876,0.795935,3.527871,0.557624,1.981639e-08,0.000017,4.757507,True,Irf1
...,...,...,...,...,...,...,...,...,...,...
Gata3_384,-0.328269,0.354874,-0.683143,,0.019065,3.348895e-02,0.509263,0.293058,False,Gata3
Gata4_386,-0.303133,0.394870,-0.698002,,0.064701,1.584733e-01,0.716787,0.144610,False,Gata4
Gata2_383,-0.299565,0.464737,-0.764302,,0.114482,1.061704e-01,0.627323,0.202509,False,Gata2
Gata1_387,-0.380034,0.406139,-0.786173,,0.018709,5.503726e-02,0.566927,0.246473,False,Gata1


# Gene expression enrichment

In [24]:
# Load the gene expression table; the first CSV column is used as the index.
gene_expr = pd.read_csv(
    "../../lsk_rna_proc/lsk_rna_state_integrated_data.csv",
    index_col=0,
)

In [59]:
# Read the saved list of cell identifiers as strings; it is later passed to
# enrich_fn as the comparison ("others") set for every fate.
gene_expr_random_cells = np.genfromtxt(
    fname="gene_expr_random_cells.txt",
    dtype=str,
)

In [60]:
# Per-fate gene expression enrichment: each fate's d2 cells are compared against
# the fixed gene_expr_random_cells set and the result is stored per fate.
gene_enrichment_table = {}
# Every fate is set to 0 neighbours: no k-nn imputation is applied in this
# section, so each fate's own cell barcodes are used directly below.
nn_impute_dict = dict.fromkeys(
    ['Mono', 'Neutro', 'Ery/Meg', 'Baso/Eos/Mast', 'pDC/Ccr7DC', 'Lym'],
    0,
)

for fate in ('Mono', 'Neutro', 'Ery/Meg', 'Lym', 'pDC/Ccr7DC', 'Baso/Eos/Mast'):
    print(fate)
    clone_curr = clone_table_d2.loc[clone_table_d2['fate'] == fate].copy()

    cells_final = clone_curr['cell.bc']
    gene_list = sf_utils.enrich_fn(
        cells_final,
        gene_expr_random_cells,
        gene_expr,
        col1_id=fate,
        col2_id="others",
        return_sig=True,
    )
    gene_enrichment_table[fate] = gene_list
    

Mono


  result = getattr(ufunc, method)(*inputs, **kwargs)


Neutro


  result = getattr(ufunc, method)(*inputs, **kwargs)


Ery/Meg


  result = getattr(ufunc, method)(*inputs, **kwargs)


Lym


  result = getattr(ufunc, method)(*inputs, **kwargs)


pDC/Ccr7DC


  result = getattr(ufunc, method)(*inputs, **kwargs)


Baso/Eos/Mast


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [61]:
# Turn each fate key into a filename-safe name: every '_' becomes '-' first,
# then every '/' becomes '_' (e.g. 'Ery/Meg' -> 'Ery_Meg').
name_list = [
    fate_key.replace("_", "-").replace("/", "_")
    for fate_key in gene_enrichment_table.keys()
]

In [62]:
# Write one CSV of gene expression enrichment results per fate.
gene_out_dir = "220208/gene_expr"
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate exists-then-create check.
os.makedirs(gene_out_dir, exist_ok=True)

# name_list is expected to hold one filename-safe name per dict entry, in the
# same order as the dict itself.
for name_curr, gene_table in zip(name_list, gene_enrichment_table.values()):
    gene_table.to_csv(f"{gene_out_dir}/{name_curr}_state_geneexpr_markers.csv")