# 8B_footprint_target_genes

7/07/2021

Can I get the gene lists for TF footprints of ATF4 and CEBPG in KCD6 and Prostate pro-loops? 


In [10]:
import pybedtools
import glob, os
import pandas as pd

save_dir = '../data/processed/fig7_vocab_go/'
vocab_geneset_dir = '../data/processed/fig7_vocab_go/vocab_geneset_all_tissue'


In [11]:
# tfs 
tf_annon_df = pd.read_csv('../data/external/HOCOMOCOv11_annotation.csv',index_col=0)
tf_annon_df['id_trim'] = tf_annon_df['id'] + '.pwm.trim'
tf_name_to_id_dict = pd.Series(tf_annon_df.id_trim.values, index=tf_annon_df.tf.values).to_dict()
tf_id_to_name_dict = pd.Series(tf_annon_df.tf.values, index=tf_annon_df.id_trim.values).to_dict()

In [12]:
def get_instances(vocab, tissue ,anc_prom, loop_df, anc_foot, pro_anc_foot_df, gene=None,verbose=False):
    if verbose:
        print('*** getting instances for gene: ', gene)
        print('anc_prom', anc_prom.shape)
    
    # get anchors that loop to promoter anchors
    if gene is not None:
        anc_prom_filt = anc_prom[anc_prom.name==gene]
    else:
        anc_prom_filt = anc_prom
    anc_prom_arr = anc_prom_filt.thickEnd.unique()
    if verbose:
        print('num promoter anchors', len(anc_prom_arr))
    
    # get loops
    loop_df_filt = loop_df[loop_df.source.isin(anc_prom_arr)|loop_df.target.isin(anc_prom_arr)]
    all_anc = sorted(set(anc_prom_filt.thickEnd.to_list() +loop_df_filt.source.to_list() + loop_df_filt.target.to_list() ))
    if verbose:
        print('num all anchors', len(all_anc))#all_anc)
        display(loop_df)
    
    # get footprinting anchor data
    anc_foot_filt = anc_foot[anc_foot.name.isin(all_anc)]
    if verbose:
        print('number of footprinted anchors with vocab',anc_foot.shape)
    
#     # get promoter footprint data
#     pro_anc_foot = pd.DataFrame()
#     if gene is not None:
#         pro_anc_foot1 = pro_anc_foot_df[(pro_anc_foot_df.name==gene)]# not perfect will catch later???
#         pro_anc_foot = pd.concat([pro_anc_foot, pro_anc_foot1])

    genomic_instances = anc_foot_filt#pd.concat([anc_foot_filt,pro_anc_foot])
    genomic_instances.columns = ['chr', 'start', 'stop', 'TSS', 'chr_m', 'start_m', 'stop_m', 'id_trim',
       'score', 'strand', 'qual','tf']
    genomic_instances.drop('qual',axis=1,inplace=True)
    genomic_instances['vocab'] = vocab
    genomic_instances['tissue'] = tissue
    genomic_instances['config'] = 'pro_loop'
    genomic_instances['gene'] = gene
    return genomic_instances
    

def get_genomic_instances_pro_loop(vocab, tissue, list_genes,verbose=False, 
                          tf_id_to_name_dict=tf_id_to_name_dict,
                          save_dir=None):
    # get genes with this vocab (any configuraton)
    if verbose:
        print('=======================')
        print( vocab, tissue)
        print(list_genes)
        print(len(list_genes))
        print('loading files...')
    # get files
    anc_prom = pybedtools.BedTool('../data/interim/annon/promoter_anchors/promoter_'+tissue+'_annon.bed').to_dataframe()
    loop_df = pd.read_csv('../data/interim/merged/loops/'+tissue+'.loops.csv',index_col=0)
    anc_foot = pybedtools.BedTool('../data/interim/annon/anchor_footprinting/'+tissue+'_annon.bed').to_dataframe()
    anc_foot['tf'] = anc_foot.thickEnd.map(tf_id_to_name_dict)
    anc_foot = anc_foot[anc_foot.tf.isin(vocab.split('::'))]
    pro_anc_foot_df = pybedtools.BedTool('../data/interim/annon/promoter_footprinting/promoter_'+tissue+'_annon.bed').to_dataframe()
    pro_anc_foot_df['tf'] = pro_anc_foot_df.thickEnd.map(tf_id_to_name_dict)

    genomic_instances_all = pd.DataFrame()
    
    if list_genes is not None:
        for gene in list_genes:
 
            genomic_instances = get_instances(vocab, tissue, anc_prom, loop_df, anc_foot, pro_anc_foot_df, gene,verbose=verbose)
            genomic_instances_all = pd.concat([genomic_instances_all, genomic_instances])
    else:
        genomic_instances_all = get_instances(vocab, tissue, anc_prom, loop_df, anc_foot, pro_anc_foot_df, gene,verbose=verbose)


    
    if verbose:
        print('*** genomic instances',genomic_instances_all.shape)
        display(genomic_instances_all)
    if save_dir is not None:
        genomic_instances_all.to_csv(os.path.join(save_dir, '_'.join(vocab.split('::'))+'_genomic_instances.csv'))
    return genomic_instances_all



def get_target_genes(tf,tissue, not_tissues=None, verbose=False):
    vocab_df = pd.read_csv('../data/processed/tissue_crms/pro_loop_tissue/'+tissue+'_crm.csv',index_col=0)
    target_genes =  vocab_df.index[vocab_df[tf+'_loop']>0].values
    print(len(target_genes), 'target genes found for', tf, 'in tissue: ', tissue)
    if not_tissues is not None:
        not_target_genes = []
        for t in not_tissues:
            not_vocab_df = pd.read_csv('../data/processed/tissue_crms/pro_loop_tissue/'+t+'_crm.csv',index_col=0)
            not_target_genes += not_vocab_df.index[not_vocab_df[tf+'_loop']>0].to_list()
        not_target_genes = set(not_target_genes)
        print(len(not_target_genes), 'off-target genes found for', tf, 'in tissues: ', not_tissues)
    target_genes =  sorted(set(target_genes) - not_target_genes)
    print(len(target_genes), 'filtered target genes found for', tf, 'in tissue: ', tissue)
    
    # get genomic instances
    df_all_instances = get_genomic_instances_pro_loop(tf, tissue, list_genes=target_genes,verbose=verbose)
    print(len(df_all_instances.gene.unique()), 'final target genes with instances found for', tf, 'in tissue: ', tissue)

#     df = df_all_instances[~df_all_instances.TSS.str.startswith('chr')]
#     if len(df)>0:
#         print(len(df.TSS.unique()), 'final target genes with instances found for', tf, 'in tissue: ', tissue)
#     else:
#         print('no target genes found for ', tf, 'in tissue: ',tissue)
#     print(sorted(df.TSS.unique()))
    print('done...')
    return df_all_instances

<!-- Can you get me the footprint DNA sequence and genomic location of the footrpint CEBPG and ATF4 for STK11? It potentially can work...

 -->
 
 basically what we want is we want target genes with loops in one tissue (KCD6) and not in other tissue (prostate) and vice versa where the differential loop in both cases uses the same motif

In [4]:
# get_genomic_instances_pro_loop('CEBPG', 'Prostate', list_genes=['STK11'])
# get_genomic_instances_pro_loop('ATF4', 'Prostate', list_genes=['STK11'])

In [5]:
atf4_kcd6.gene.unique().shape

NameError: name 'atf4_kcd6' is not defined

In [None]:
atf4_kcd6 = get_target_genes('ATF4', 'GDSD6', not_tissues=['Prostate'])
atf4_pros = get_target_genes('ATF4', 'Prostate', not_tissues=['GDSD6'])

In [6]:
cebpg_kcd6 = get_target_genes('CEBPG', 'GDSD6', not_tissues=['Prostate'])
cebpg_pros = get_target_genes('CEBPG', 'Prostate', not_tissues=['GDSD6'])

1065 target genes found for CEBPG in tissue:  GDSD6
376 off-target genes found for CEBPG in tissues:  ['Prostate']
887 filtered target genes found for CEBPG in tissue:  GDSD6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus

887 final target genes with instances found for CEBPG in tissue:  GDSD6
done...
376 target genes found for CEBPG in tissue:  Prostate
1065 off-target genes found for CEBPG in tissues:  ['GDSD6']
198 filtered target genes found for CEBPG in tissue:  Prostate
198 final target genes with instances found for CEBPG in tissue:  Prostate
done...


In [52]:
smad4_kcd6 = get_target_genes('SMAD4', 'GDSD6', not_tissues=['Prostate'])
smad4_pros = get_target_genes('SMAD4', 'Prostate', not_tissues=['GDSD6'])

764 target genes found for SMAD4 in tissue:  GDSD6
456 off-target genes found for SMAD4 in tissues:  ['Prostate']
566 filtered target genes found for SMAD4 in tissue:  GDSD6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

566 final target genes with instances found for SMAD4 in tissue:  GDSD6
done...
456 target genes found for SMAD4 in tissue:  Prostate
764 off-target genes found for SMAD4 in tissues:  ['GDSD6']
258 filtered target genes found for SMAD4 in tissue:  Prostate
258 final target genes with instances found for SMAD4 in tissue:  Prostate
done...


In [53]:
atf4_kcd6.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/ATF4_kcd6_manual.csv')
atf4_pros.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/ATF4_prostate_manual.csv')
cebpg_kcd6.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/CEBPG_kcd6_manual.csv')
cebpg_pros.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/CEBPG_prostate_manual.csv')
smad4_kcd6.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/SMAD4_kcd6_manual.csv')
smad4_pros.to_csv('../data/processed/fig7_vocab_go/genomic_instance_overlap/SMAD4_prostate_manual.csv')

In [33]:
vocab = 'ARNT::SP1'
df =get_genomic_instances_pro_loop(vocab, 'SCC13-CTRLi', ['CYP1B1'])
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Unnamed: 0,chr,start,stop,TSS,chr_m,start_m,stop_m,id_trim,score,strand,tf,vocab,tissue,config,gene
1222741,chr2,37550000,37555000,chr2_37550000_37555000,chr2,37551938,37551959,SP1_HUMAN.H11MO.0.A.pwm.trim,12.651939,-,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223261,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898508,37898518,SP1_HUMAN.H11MO.1.A.pwm.trim,10.291843,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223298,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898822,37898843,SP1_HUMAN.H11MO.0.A.pwm.trim,22.014516,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223307,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898823,37898833,SP1_HUMAN.H11MO.1.A.pwm.trim,15.157784,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223321,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898825,37898846,SP1_HUMAN.H11MO.0.A.pwm.trim,7.886173,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223327,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898827,37898848,SP1_HUMAN.H11MO.0.A.pwm.trim,17.743111,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223332,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898828,37898838,SP1_HUMAN.H11MO.1.A.pwm.trim,12.310072,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223340,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898830,37898851,SP1_HUMAN.H11MO.0.A.pwm.trim,7.260751,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223344,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898832,37898853,SP1_HUMAN.H11MO.0.A.pwm.trim,16.88597,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1
1223347,chr2,37895000,37900000,chr2_37895000_37900000,chr2,37898833,37898843,SP1_HUMAN.H11MO.1.A.pwm.trim,12.68293,+,SP1,ARNT::SP1,SCC13-CTRLi,pro_loop,CYP1B1


In [35]:
prom_anc_arr = ['chr2_38320000_38325000', 'chr2_38325000_38330000',
       'chr2_38330000_38335000']

df[df.TSS.isin(prom_anc_arr)]
# df[df.TSS.isin(['chr2_38320000_38325000'])]

Unnamed: 0,chr,start,stop,TSS,chr_m,start_m,stop_m,id_trim,score,strand,tf,vocab,tissue,config,gene


In [24]:
loop_df[loop_df.source.isin(['chr2_38465000_38470000'])|loop_df.target.isin(['chr2_38465000_38470000'])]

Unnamed: 0,source,target,count
546518,chr2_38320000_38325000,chr2_38465000_38470000,22


In [27]:
tissue='CAL27-CTRLi'
list_genes = ['CUL9']
anc_prom = pybedtools.BedTool('../data/interim/annon/promoter_anchors/promoter_'+tissue+'_annon.bed').to_dataframe()
anc_prom = anc_prom[anc_prom.name.isin(list_genes)]
anc_prom_arr = anc_prom.thickEnd.unique()
    
loop_df = pd.read_csv('../data/interim/merged/loops/'+tissue+'.loops.csv',index_col=0)
loop_df = loop_df[loop_df.source.isin(anc_prom_arr)|loop_df.target.isin(anc_prom_arr)]
all_anc = sorted(set(anc_prom.thickEnd.to_list() +loop_df.source.to_list() + loop_df.target.to_list() ))

anc_foot = pybedtools.BedTool('../data/interim/annon/anchor_footprinting/'+tissue+'_annon.bed').to_dataframe()
anc_foot['tf'] = anc_foot.thickEnd.map(tf_id_to_name_dict)
anc_foot = anc_foot[anc_foot.tf.isin(vocab.split('::'))]
anc_foot = anc_foot[anc_foot.name.isin(all_anc)]


pro_anc_foot_df = pybedtools.BedTool('../data/interim/annon/promoter_footprinting/promoter_'+tissue+'_annon.bed').to_dataframe()
pro_anc_foot_df['tf'] = pro_anc_foot_df.thickEnd.map(tf_id_to_name_dict)
pro_anc_foot = pd.DataFrame()

for gene in list_genes:
    pro_anc_foot1 = pro_anc_foot_df[(pro_anc_foot_df.name==gene)|(pro_anc_foot_df.tf.isin(vocab.split('::')))]# not perfect will catch later???
    pro_anc_foot1['anchor'] = pro_anc_foot_df.score + "_" + pro_anc_foot_df.strand.map(str)+"_"+ pro_anc_foot_df.thickStart.map(str)
    pro_anc_foot1 = pro_anc_foot1[pro_anc_foot1.anchor.isin(all_anc)]
    pro_anc_foot1.drop(columns=['anchor'],inplace=True)
    pro_anc_foot = pd.concat([pro_anc_foot, pro_anc_foot1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
anc_foot

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,tf
16,chr1,740000,745000,chr1_740000_745000,chr1,740299,740305,ARNT_HUMAN.H11MO.0.B.pwm.trim,10.397177,-,.,ARNT
56,chr1,760000,765000,chr1_760000_765000,chr1,762629,762650,SP1_HUMAN.H11MO.0.A.pwm.trim,6.556889,+,.,SP1
110,chr1,760000,765000,chr1_760000_765000,chr1,762921,762931,SP1_HUMAN.H11MO.1.A.pwm.trim,18.026685,+,.,SP1
123,chr1,760000,765000,chr1_760000_765000,chr1,762960,762981,SP1_HUMAN.H11MO.0.A.pwm.trim,18.764170,+,.,SP1
127,chr1,760000,765000,chr1_760000_765000,chr1,762961,762971,SP1_HUMAN.H11MO.1.A.pwm.trim,11.564458,+,.,SP1
...,...,...,...,...,...,...,...,...,...,...,...,...
2553907,chrX,154255000,154260000,chrX_154255000_154260000,chrX,154255348,154255369,SP1_HUMAN.H11MO.0.A.pwm.trim,10.018741,+,.,SP1
2554020,chrX,154295000,154300000,chrX_154295000_154300000,chrX,154299437,154299458,SP1_HUMAN.H11MO.0.A.pwm.trim,14.421472,-,.,SP1
2554061,chrX,154295000,154300000,chrX_154295000_154300000,chrX,154299481,154299502,SP1_HUMAN.H11MO.0.A.pwm.trim,9.501379,-,.,SP1
2554076,chrX,154295000,154300000,chrX_154295000_154300000,chrX,154299491,154299501,SP1_HUMAN.H11MO.1.A.pwm.trim,11.706440,-,.,SP1
