# CRM target mpra genes

Point 14: “It would be informative to include a new figure including a gene ontology and/or pathway analysis of all TFs identified from the motif analysis. This would represent an informative big picture analysis of all pathways associated with psychiatric disease. Analysis of TF motifs disrupted by the identified daSNVs should include RNA-seq (or qPCR), demonstrating the relative expression of these TFs in neurons and astrocytes. Are top TFs expressed in the fetal/adult post-mortem brain? Perhaps a cell-type-specific or developmental analysis of TF and target gene expression might inform when these different aspects of disease risk exert their effects? Does transcriptomic imputation (e.g.TWAS/PrediXcan) predict an enrichment of TF gene expression in psychiatric disease?”

With regard to linking TFs to their target genes, we will link these TFs to putative target genes via our matched RNA, ATAC, and HiChIP datasets, and then complete the analysis by describing the neuropsychiatric disease relevance of these TF-targeted genes. These analyses greatly strengthen the main messages of the work.

In [13]:
import pandas as pd
import numpy as np
import os, glob, sys

In [25]:
# Load the HOCOMOCO v11 annotation table and keep its `tf` column as an array
# (names may repeat, e.g. one entry per annotated motif row).
hocomoco_annotation = pd.read_csv('../data/external/HOCOMOCOv11_annotation.csv')
tfs = hocomoco_annotation['tf'].values
tfs

array(['AHR', 'AIRE', 'ALX1', 'ALX3', 'ALX4', 'AR', 'AR', 'AR', 'TFAP2A',
       'TFAP2B', 'TFAP2C', 'TFAP2D', 'ARID3A', 'ARID5B', 'ARNT2', 'ARNT',
       'ARX', 'ASCL1', 'ASCL2', 'ATF1', 'ATF2', 'ATF2', 'ATF2', 'ATF3',
       'ATF4', 'ATF6', 'ATF7', 'ATOH1', 'BACH1', 'BACH2', 'BARHL1',
       'BARHL2', 'BARX1', 'BARX2', 'BATF3', 'BATF', 'BATF', 'BCL11A',
       'BCL6B', 'BCL6', 'BHLHA15', 'BHLHE22', 'BHLHE23', 'BHLHE40',
       'BHLHE41', 'ARNTL', 'BPTF', 'TBXT', 'TBXT', 'BRCA1', 'BSX',
       'CDC5L', 'CDX1', 'CDX2', 'CEBPA', 'CEBPB', 'CEBPD', 'CEBPE',
       'CEBPG', 'CEBPZ', 'CENPB', 'CLOCK', 'EBF1', 'NR2F1', 'NR2F1',
       'NR2F2', 'NR2F2', 'CPEB1', 'CREB3L1', 'CREB3L2', 'CREB1', 'CREB3',
       'CREB5', 'CREM', 'CRX', 'CTCFL', 'CTCF', 'CUX1', 'CUX2', 'CXXC1',
       'DBP', 'DDIT3', 'DLX1', 'DLX2', 'DLX3', 'DLX4', 'DLX5', 'DLX6',
       'DMBX1', 'DMRT1', 'DPRX', 'DRGX', 'DUX4', 'DUXA', 'E2F1', 'E2F2',
       'E2F3', 'E2F4', 'E2F4', 'E2F5', 'E2F6', 'E2F7', 'E2F8', 'E4F1',
       '

In [26]:
# Read the headerless, one-gene-per-line MPRA eGene list into a single `genes` column.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced with a project-relative location.
egene_list_path = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/MPRA_022022/mpraanalyze/egene_gtex/all.txt'
all_mpra_egenes_mpra = pd.read_table(egene_list_path, header=None, names=['genes'])
all_mpra_egenes_mpra

Unnamed: 0,genes
0,AADAT
1,ABCB9
2,ABCC8
3,ABHD16A
4,ABT1
...,...
424,ZSCAN12
425,ZSCAN23
426,ZSCAN26
427,ZSCAN31


In [27]:
# Flag every MPRA eGene whose symbol also appears in the `tfs` array.
# `Series.isin` performs the membership test in one vectorised pass instead of
# scanning the whole `tfs` array once per row via `apply`.
all_mpra_egenes_mpra['is_tf'] = all_mpra_egenes_mpra['genes'].isin(tfs)
all_mpra_egenes_mpra

Unnamed: 0,genes,is_tf
0,AADAT,False
1,ABCB9,False
2,ABCC8,False
3,ABHD16A,False
4,ABT1,False
...,...,...
424,ZSCAN12,False
425,ZSCAN23,False
426,ZSCAN26,False
427,ZSCAN31,True


In [39]:
# Take every gene in the eGene table (not only rows with is_tf == True) and build the
# candidate column names `<gene>_pro` and `<gene>_loop`, in that order for each gene.
mpra_tf_list = all_mpra_egenes_mpra.genes.to_list()
mpra_tf_list_col = [
    gene + suffix
    for gene in mpra_tf_list
    for suffix in ('_pro', '_loop')
]

In [28]:
# Tally how many eGenes are / are not flagged as TFs.
all_mpra_egenes_mpra['is_tf'].value_counts()

False    413
True      16
Name: is_tf, dtype: int64

In [30]:
# Keep only the eGene rows flagged as TFs.
mpra_df = all_mpra_egenes_mpra.loc[all_mpra_egenes_mpra['is_tf']]

In [31]:
%%time
crm_df = pd.read_csv("../data/processed/tissue_crms/all_count_comb_overall.csv")
crm_df.set_index('Unnamed: 0',inplace=True)

In [64]:
# Keep only the candidate `<gene>_pro` / `<gene>_loop` names that exist as columns in crm_df.
mpra_tf_list_col_filt = [col for col in mpra_tf_list_col if col in crm_df.columns]
# Recover each gene symbol by stripping only the trailing `_pro` / `_loop` suffix.
# Splitting on the first underscore would truncate any symbol that itself contains `_`.
# NOTE(review): a gene is kept if either of its two columns exists; later cells that
# index both `<gene>_pro` and `<gene>_loop` assume both are present.
mpra_tf_list_filt = sorted({col.rsplit('_', 1)[0] for col in mpra_tf_list_col_filt})
print(mpra_tf_list_filt)
mpra_tf_list_col_filt

['ARNT', 'IRF3', 'MAFA', 'NFATC3', 'NFKB1', 'NR1H3', 'PBX1', 'PBX2', 'POU5F1', 'RFX2', 'SHOX2', 'SREBF1', 'TEF', 'ZKSCAN3', 'ZNF322', 'ZSCAN31']


['ARNT_pro',
 'ARNT_loop',
 'IRF3_pro',
 'IRF3_loop',
 'MAFA_pro',
 'MAFA_loop',
 'NFATC3_pro',
 'NFATC3_loop',
 'NFKB1_pro',
 'NFKB1_loop',
 'NR1H3_pro',
 'NR1H3_loop',
 'PBX1_pro',
 'PBX1_loop',
 'PBX2_pro',
 'PBX2_loop',
 'POU5F1_pro',
 'POU5F1_loop',
 'RFX2_pro',
 'RFX2_loop',
 'SHOX2_pro',
 'SHOX2_loop',
 'SREBF1_pro',
 'SREBF1_loop',
 'TEF_pro',
 'TEF_loop',
 'ZKSCAN3_pro',
 'ZKSCAN3_loop',
 'ZNF322_pro',
 'ZNF322_loop',
 'ZSCAN31_pro',
 'ZSCAN31_loop']

In [50]:
# Preview the first five rows of the CRM table.
crm_df.head(5)

Unnamed: 0_level_0,tissue,exp,num_loop_counts,num_loops,num_atac_regions_pro,num_atac_regions_loop,AHR_pro,AR_pro,ARID3A_pro,ARID5B_pro,...,HOXC10_loop,LMX1A_loop,DLX3_pro,RAX_pro,DLX3_loop,RAX_loop,DLX2_pro,TAL1_pro,DLX2_loop,TAL1_loop
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,Astrocytes,18.66,287.0,94.0,0.0,251.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,Astrocytes,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,Astrocytes,0.01,140.0,79.0,0.0,36.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,Astrocytes,50.67,77.0,54.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,Astrocytes,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Report the table dimensions, then show only the retained `_pro` / `_loop` columns.
print(crm_df.shape)
crm_df[mpra_tf_list_col_filt]

(213174, 1157)


Unnamed: 0_level_0,ARNT_pro,ARNT_loop,IRF3_pro,IRF3_loop,MAFA_pro,MAFA_loop,NFATC3_pro,NFATC3_loop,NFKB1_pro,NFKB1_loop,...,SREBF1_pro,SREBF1_loop,TEF_pro,TEF_loop,ZKSCAN3_pro,ZKSCAN3_loop,ZNF322_pro,ZNF322_loop,ZSCAN31_pro,ZSCAN31_loop
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYG11B,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZZEF1,0.0,2.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0


In [68]:
# Display the sorted gene names that have at least one matching column in crm_df.
mpra_tf_list_filt

['ARNT',
 'IRF3',
 'MAFA',
 'NFATC3',
 'NFKB1',
 'NR1H3',
 'PBX1',
 'PBX2',
 'POU5F1',
 'RFX2',
 'SHOX2',
 'SREBF1',
 'TEF',
 'ZKSCAN3',
 'ZNF322',
 'ZSCAN31']

In [67]:
# For every retained TF, collect the genes (crm_df index labels) with exp > 1 and a
# positive count in the TF's `_pro` / `_loop` column, print both set sizes, and write
# the `_pro` set to `../data/processed/mpra_egenes/<TF>.txt`, one gene per line.
target_dir = '../data/processed/mpra_egenes/'
os.makedirs(target_dir, exist_ok=True)  # the directory may not exist on a fresh checkout
is_expressed = crm_df.exp > 1  # loop-invariant mask, computed once
for tf in mpra_tf_list_filt:
    print('((()))')
    print(tf)
    # set() collapses repeated index labels; sorting makes the output file
    # identical across kernel sessions (plain set order is hash-dependent).
    genes_target_pro = sorted(set(crm_df.index[(crm_df[tf + '_pro'] > 0) & is_expressed]))
    genes_target_loop = sorted(set(crm_df.index[(crm_df[tf + '_loop'] > 0) & is_expressed]))
    print(len(genes_target_pro), len(genes_target_loop))
    # NOTE(review): only the `_pro` targets are saved; the `_loop` targets are
    # counted above but never written -- confirm this is intentional.
    with open(target_dir + tf + '.txt', 'w') as f:
        f.writelines(gene + '\n' for gene in genes_target_pro)
    

((()))
ARNT
52 10552
((()))
IRF3
133 12976
((()))
MAFA
26 6750
((()))
NFATC3
24 9036
((()))
NFKB1
59 10691
((()))
NR1H3
121 12706
((()))
PBX1
85 12066
((()))
PBX2
20 6128
((()))
POU5F1
43 10370
((()))
RFX2
114 12384
((()))
SHOX2
9 4289
((()))
SREBF1
74 11604
((()))
TEF
17 6185
((()))
ZKSCAN3
29 7253
((()))
ZNF322
65 10382
((()))
ZSCAN31
102 11714
