In [93]:
import pandas as pd
import numpy as np
import pickle 

In [108]:
# ENSID to gene name
infile = '../genome/hg38/hg38_ENSID_Genename_synonyms.txt.gz'
df = pd.read_csv(infile,sep='\t',usecols=[0,1])
df.replace({np.nan:'None'},inplace=True)
ID_2_GeneName = dict(zip(df['Gene stable ID'],df['Gene name']))

# load synonym to gene name dictionary
infile = '../genome/hg38/GeneName_Synonyms_dict.pkl'
with open(infile, 'rb') as f:
    Synonym_2_GeneName = pickle.load(f)


In [194]:
# Human tf list

infile = "../resources/The_Human_Transcription_Factors_Cell_2018.xlsx"
df = pd.read_excel(infile, sheet_name="Table S1. Related to Figure 1B", skiprows=1,index_col=0)
# rename column 3:
rename = {"Unnamed: 3": "Is TF"}
df = df.rename(columns=rename)

#df = df.loc[:,"Is TF"] == 'Yes'
TFs = set( np.sort( df.loc[df.loc[:,"Is TF"] == 'Yes',:].Name.unique() ) )

# correct gene names
to_add = set()
to_remove = set()
for g in TFs:
    if g in ID_2_GeneName.values():
        continue
    else:
        if g in Synonym_2_GeneName.keys():
            to_add.add(Synonym_2_GeneName[g])
        else:
            to_remove.add(g)
TFs = TFs.difference(to_remove).union(to_add)

# GO terms
go_terms = ['negative_regulation_of_transcription_by_RNA_polymerase_II','positive_regulation_of_transcription_by_RNA_polymerase_II','regulation_of_transcription_by_RNA_polymerase_II','transcription_cis-regulatory_region_binding']
infiles = [f'../resources/GO_terms/hg38/{f}.txt' for f in go_terms]

go_genes = set()
for f in infiles:
    with open(f,'r') as fin:
        for line in fin:
            go_genes.add(line.strip())

# correct gene names
to_add = set()
to_remove = set()
for g in go_genes:
    if g in ID_2_GeneName.values():
        continue
    else:
        if g in Synonym_2_GeneName.keys():
            to_add.add(Synonym_2_GeneName[g])
        else:
            to_remove.add(g)

go_genes = go_genes.difference(to_remove).union(to_add)

TFs = go_genes.union(set(TFs))

# save
outfile = '../resources/hg38_TFs.txt'
with open(outfile,'w') as fout:
    for g in TFs:
        fout.write(f'{g}\n')

In [195]:
# mouse



# Human tf list

# ENSID to gene name
infile = '../genome/mm10/mm10_ENSID_Genename_synonyms.txt.gz'
df = pd.read_csv(infile,sep='\t',usecols=[0,1])
df.replace({np.nan:'None'},inplace=True)
ID_2_GeneName = dict(zip(df['Gene stable ID'],df['Gene name']))


# load synonym to gene name dictionary
infile = '../genome/mm10/GeneName_Synonyms_dict.pkl'
with open(infile, 'rb') as f:
    Synonym_2_GeneName = pickle.load(f)

infile = "../resources/mouse_ensemble_tfs_from_lambertetal_isyes.unique.txt"
TFs = set()
with open(infile,'r') as fin:
    for line in fin:
        tf = line.strip()
        if tf in ID_2_GeneName:
            TFs.add(ID_2_GeneName[tf])
        else:
            print(f'{tf} not found in ID_2_GeneName')  

# GO terms
go_terms = ['negative_regulation_of_transcription_by_RNA_polymerase_II','positive_regulation_of_transcription_by_RNA_polymerase_II','regulation_of_transcription_by_RNA_polymerase_II','transcription_cis-regulatory_region_binding']
infiles = [f'../resources/GO_terms/mm10/{f}.txt' for f in go_terms]

go_genes = set()
for f in infiles:
    with open(f,'r') as fin:
        for line in fin:
            go_genes.add(line.strip())

# correct gene names
to_add = set()
to_remove = set()
for g in go_genes:
    if g in ID_2_GeneName.values():
        continue
    else:
        if g in Synonym_2_GeneName.keys():
            to_add.add(Synonym_2_GeneName[g])
        else:
            to_remove.add(g)

go_genes = go_genes.difference(to_remove).union(to_add)

TFs = go_genes.union(set(TFs))

# save
outfile = '../resources/hg38/TFs.txt'
with open(outfile,'w') as fout:
    for g in TFs:
        fout.write(f'{g}\n')

ENSMUSG00000118665 not found in ID_2_GeneName


In [189]:
len(go_genes)

2565

In [190]:
len(TFs)

1372

In [191]:
len(go_genes.union(set(TFs)))

2746

In [259]:
infile_chip = '../resources/experimentList_v3.tab'
all_chip = pd.read_csv(infile_chip,sep='\t',header=None,usecols=[0,1,2,3,4,5,6,7,8],index_col=0)
all_chip.columns = ['genome','antigen_class','antigen','celltype_class','celltype','celltype_description','QC','title']
all_chip = all_chip[ all_chip.antigen_class=='TFs and others' ]

In [273]:
for genome in ['mm10','hg38']:
    print(genome)
    chip = all_chip[ all_chip.genome==genome ]
    antigen = chip.antigen.unique()

    # load gene name list
    infile = f'../genome/{genome}/{genome}_ENSID_Genename_synonyms.txt.gz'
    df = pd.read_csv(infile,sep='\t',usecols=[1])
    df.replace({np.nan:'None'},inplace=True)
    GeneName = set(df['Gene name'])

    # load synonym to gene name dictionary
    infile = f'../genome/{genome}/GeneName_Synonyms_dict.pkl'
    with open(infile, 'rb') as f:
        Synonym_2_GeneName = pickle.load(f)

    to_change = []
    not_found = []
    for g in antigen:
        if g in GeneName:
            continue
        else:
            if g in Synonym_2_GeneName.keys():
                to_change.append([g,Synonym_2_GeneName[g]])
            else:
                not_found.append(g)
    
    print(f'to change: {len(to_change)}')
    print(np.array(to_change))
    print(f'not found: {len(not_found)}')
    print(np.array(not_found))

    to_change = dict(zip(np.array(to_change)[:,0],np.array(to_change)[:,1]))

    # change gene names
    for g in to_change.keys():
        idx = chip[ chip.antigen==g ].index
        for i in idx:
            chip.at[i,'antigen'] = to_change[chip.at[i,'antigen']]



mm10
to change: 12
[['H1foo' 'H1f8']
 ['Whsc1l1' 'Nsd3']
 ['Cpsf3l' 'Ints11']
 ['Fam60a' 'Sinhcaf']
 ['INO80' 'Ino80']
 ['Iho1' 'Ccdc36']
 ['Mkl1' 'Mrtfa']
 ['Mkl2' 'Mrtfb']
 ['Zfp191' 'Zfp24']
 ['Fam208a' 'Tasor']
 ['Hdgfrp2' 'Hdgfl2']
 ['Cirh1a' 'Utp4']]
not found: 25
['Epitope tags' 'Meiosin' 'GFP' 'Biotin' '5-hmC' 'ADP-ribose'
 'Crotonyl lysine' '5-mC' 'DNA-RNA hybrid' 'RFP' 'Cas9' 'BrdU' 'CEBPA'
 '8-Hydroxydeoxyguanosine' 'HBx' 'Lysin homocysteine' 'Succinyllysine'
 'O-GlcNAc' 'AML1-ETO' 'Fo' 'Dendra2' 'Hdgf2' 'Propionyllysine'
 'Butyryllysine' 'RAF1']
hg38
to change: 32
[['T' 'TBXT']
 ['CCDC101' 'SGF29']
 ['ARNTL' 'BMAL1']
 ['ZNF788' 'ZNF788P']
 ['ZCCHC11' 'TUT4']
 ['RNF219' 'OBI1']
 ['ZNF720' 'KRBOX5']
 ['PHB' 'PHB1']
 ['WHSC1' 'NSD2']
 ['C11orf53' 'POU2AF2']
 ['ZUFSP' 'ZUP1']
 ['CPSF3L' 'INTS11']
 ['WDR61' 'SKIC8']
 ['C7orf26' 'INTS15']
 ['MINA' 'RIOX2']
 ['TCEB3C' 'ELOA3P']
 ['MRE11A' 'MRE11']
 ['GUCY1B3' 'GUCY1B1']
 ['ZSCAN5D' 'ZSCAN5DP']
 ['MGEA5' 'OGA']
 ['WHSC1L1' 'NSD3']


In [271]:
chip

Unnamed: 0_level_0,genome,antigen_class,antigen,celltype_class,celltype,celltype_description,QC,title
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DRX000550,hg38,TFs and others,PARK7,Neural,SH-SY5Y,Primary Tissue=Brain|Tissue Diagnosis=Neurobla...,"8039428,33.4,42.5,165",Genome-wide analysis of DJ-1-binding regions o...
DRX001460,hg38,TFs and others,MLL-AF6,Blood,ML-2,Primary Tissue=Bone Marrow|Tissue Diagnosis=Le...,"30648533,94.0,11.0,477",Comprehensive identification and characterizat...
DRX002911,hg38,TFs and others,LRIF1,Neural,hTERT RPE-1,Primary Tissue=Eye|Tissue Diagnosis=Normal,"39861269,96.9,75.4,1489",HBiX1 ChIP-seq in hTERT-RPE1
DRX002912,hg38,TFs and others,SMCHD1,Neural,hTERT RPE-1,Primary Tissue=Eye|Tissue Diagnosis=Normal,"38935687,96.2,63.0,1817",SMCHD1 ChIP-seq in hTERT-RPE1
DRX002915,hg38,TFs and others,SMCHD1,Neural,hTERT RPE-1,Primary Tissue=Eye|Tissue Diagnosis=Normal,"32681797,97.5,45.0,992",SMCHD1 ChIP-seq in hTERT-RPE1 treated with HBi...
...,...,...,...,...,...,...,...,...
SRX9975616,hg38,TFs and others,SMC1A,Bone,A-673,Primary Tissue=Skeletal Muscle|Tissue Diagnosi...,"94607697,96.6,12.7,68149",GSM5049899: SMC1A on A673 STAG2 KO [BA A673 ST...
SRX9977681,hg38,TFs and others,LARP7,Cardiovascular,HUVEC,Primary Tissue=Umbilical Cord|Tissue Diagnosis...,"15598222,0.0,59.1,296",GSM5050367: LARP7 ChIPSeq; Homo sapiens; ChIP-Seq
SRX9977682,hg38,TFs and others,TRIM28,Cardiovascular,HUVEC,Primary Tissue=Umbilical Cord|Tissue Diagnosis...,"13137802,0.0,15.4,848",GSM5050368: TRIM28 ChIPSeq; Homo sapiens; ChIP...
SRX9989504,hg38,TFs and others,GFP,Neural,Neural progenitor cells,,"10744456,95.8,16.3,568",GSM5057432: IP NS5 rep1; Homo sapiens; ChIP-Seq


In [272]:
print( chip.loc[chip.antigen =='ARNTL'] )
print( chip.loc[chip.antigen =='BMAL1'] )

Empty DataFrame
Columns: [genome, antigen_class, antigen, celltype_class, celltype, celltype_description, QC, title]
Index: []
            genome   antigen_class antigen celltype_class       celltype  \
0                                                                          
SRX1034771    hg38  TFs and others   BMAL1           Bone           U2OS   
SRX10475531   hg38  TFs and others   BMAL1          Liver         Hep G2   
SRX10475532   hg38  TFs and others   BMAL1          Liver         Hep G2   
SRX10829233   hg38  TFs and others   BMAL1           Bone           U2OS   
SRX10829234   hg38  TFs and others   BMAL1           Bone           U2OS   
...            ...             ...     ...            ...            ...   
SRX8151645    hg38  TFs and others   BMAL1      Adipocyte  Preadipocytes   
SRX8151646    hg38  TFs and others   BMAL1      Adipocyte  Preadipocytes   
SRX8151647    hg38  TFs and others   BMAL1      Adipocyte  Preadipocytes   
SRX8151648    hg38  TFs and others   

In [250]:
chip.loc[chip.antigen =='ARNTL']
g = 'ARNTL'
idx = chip.loc[chip.antigen == g]
chip.loc[idx,'antigen'] = to_change[g]
chip.loc[chip.antigen =='ARNTL']

KeyError: "None of [Index([                                                                      ('g', 'e', 'n', 'o', 'm', 'e'),\n                                          ('a', 'n', 't', 'i', 'g', 'e', 'n', '_', 'c', 'l', 'a', 's', 's'),\n                                                                        ('a', 'n', 't', 'i', 'g', 'e', 'n'),\n                                     ('c', 'e', 'l', 'l', 't', 'y', 'p', 'e', '_', 'c', 'l', 'a', 's', 's'),\n                                                                   ('c', 'e', 'l', 'l', 't', 'y', 'p', 'e'),\n       ('c', 'e', 'l', 'l', 't', 'y', 'p', 'e', '_', 'd', 'e', 's', 'c', 'r', 'i', 'p', 't', 'i', 'o', 'n'),\n                                                                                                 ('Q', 'C'),\n                                                                                  ('t', 'i', 't', 'l', 'e')],\n      dtype='object', name=0)] are in the [index]"

In [288]:
infile = '../resources/hg38/TF_list.txt'
with open(infile,'r') as f:
    TFs = [tf.strip() for tf in f.readlines()]

In [289]:
TFs

['NRIP2',
 'ZNF578',
 'MYRF',
 'TRIM29',
 'TEAD2',
 'ASH2L',
 'ST3GAL4',
 'CHP2',
 'SSBP3',
 'ZNF681',
 'RBBP8',
 'HOXD3',
 'ZNF787',
 'ZNF576',
 'CRLF3',
 'BUD31',
 'MBD3',
 'ZNF23',
 'SOX4',
 'RIGI',
 'HMX1',
 'JMY',
 'ZNF792',
 'LARP7',
 'SNAPC5',
 'TXNIP',
 'PTOV1',
 'DMRTA1',
 'ZNF492',
 'ZNF287',
 'DMRT3',
 'CEBPZ',
 'PGBD1',
 'TLX3',
 'JAZF1',
 'MSX2',
 'IRF7',
 'XRCC6',
 'EMC2',
 'SEPTIN3',
 'KLF6',
 'ZNF77',
 'WAS',
 'PTMA',
 'NKX3-2',
 'IRF2',
 'ABLIM3',
 'MBD3L2B',
 'ZNF253',
 'RPAP2',
 'MED5',
 'STAT2',
 'TFAP2E',
 'CBFB',
 'NUCKS1',
 'ZNF397',
 'RB1',
 'THAP10',
 'NACC1',
 'TFDP1',
 'BAHD1',
 'PI',
 'CDX4',
 'LIN54',
 'MAGEB17',
 'ATM',
 'TAL2',
 'TARBP1',
 'ZSCAN5DP',
 'ZSCAN9',
 'CRK',
 'ZNF239',
 'TBL1XR1',
 'ZNF229',
 'SNAPC4',
 'SUMO2',
 'SP2',
 'HIF3A',
 'GLMP',
 'ZBTB16',
 'ZNF763',
 'CCNT2',
 'PLK1',
 'MLX',
 'ZNF500',
 'ZNF490',
 'CUX2',
 'SATB2',
 'CCNL1',
 'TRAFD1',
 'ZNF665',
 'ZNF355P',
 'ZNF415',
 'NFIB',
 'ARM',
 'ZNF841',
 'YEATS2',
 'ARID2',
 'LEFTY1',
 'T

In [293]:
on = 0
off = 0
for g in chip.antigen.unique():
    if g in np.array(TFs):
        on += 1
    else:
        off += 1

In [294]:
print(on,off)

1426 414


In [297]:
infile = '../genome/mm10/mm10_ENSID_Genename_synonyms.txt.gz'
df = pd.read_csv(infile,sep='\t')
df.replace({np.nan:'None'},inplace=True)
df.drop(df.index[df['Gene Synonym']=="None"],inplace=True)
df.drop_duplicates(inplace=True)
df.drop( df.loc[df['Gene Synonym'] == df['Gene name']].index, inplace=True)

In [310]:
df

Unnamed: 0,Gene stable ID,Gene name,Gene Synonym
0,ENSMUSG00000064372,mt-Tp,tRNA
1,ENSMUSG00000064372,mt-Tp,tRNA-Pro
2,ENSMUSG00000064372,mt-Tp,TrnP tRNA
3,ENSMUSG00000064371,mt-Tt,tRNA
4,ENSMUSG00000064371,mt-Tt,tRNA-Thr
...,...,...,...
82453,ENSMUSG00000116485,Mir7679,mmu-mir-7679
82459,ENSMUSG00000116523,Gm12185,Irgb2b1
82465,ENSMUSG00000115991,Slfn2,Shlf2
82474,ENSMUSG00000116327,Slfn14,LOC237890
