In [1]:
import pandas as pd

In [2]:
two_way_blast = pd.read_csv('./MG_BW_TWBH.txt', sep='\t')

In [3]:
# The intergenic regions in MG1655 and BW25113 were extracted and saved in fasta files. 
# two way reciprocal blast analysis were performed to determine the conservation of the intergenic regions
# If a promoter region in MG1655 and BW25113 are 100% conserved, then the TF binding sites are assigned to BW25113
# The position of the TF binding sites are then corrected according to the BW25113 genome. 


# result of the two-way-blast
# 1.0 means 100% conserved
two_way_blast.head()

Unnamed: 0,gene_id,CP009273_intergenic_score,CP009273_intergenic.ffn,NC_000913_intergenic_score,NC_000913_intergenic.ffn
0,forb0001_forb0002|255_336,1.0,forBW25113_0001_forBW25113_0002|255_336,1.0,forb0001_forb0002|255_336
1,forb0004_forb0005|5020_5233,1.0,forBW25113_0004_forBW25113_0005|5020_5233,1.0,forb0004_forb0005|5020_5233
2,forb0005_revb0006|5530_5682,1.0,forBW25113_0005_revBW25113_0006|5530_5682,1.0,forb0005_revb0006|5530_5682
3,revb0006_revb0007|6459_6528,1.0,revBW25113_0006_revBW25113_0007|6459_6528,1.0,revb0006_revb0007|6459_6528
4,revb0007_forb0008|7959_8237,1.0,revBW25113_0007_forBW25113_0008|7959_8237,1.0,revb0007_forb0008|7959_8237


In [7]:
# reading the MG1655 TF. binding site from EcoCyc, which I save in a bed file
TF = pd.read_csv('./Ecocyc_binding_motifs.bed', sep='\t', names=['accession','start','stop',
                                                                'TF_name','score','strand',
                                                                'st','sp','color'])

In [8]:
# this is the bed file format
TF.head()

Unnamed: 0,accession,start,stop,TF_name,score,strand,st,sp,color
0,NC_000913,17384,17401,NhaR-Na+DNA-binding-site,1000,+,17384,17401,25500
1,NC_000913,17406,17423,NhaR-Na+DNA-binding-site,1000,+,17406,17423,25500
2,NC_000913,17416,17433,NhaR-Na+DNA-binding-site,1000,+,17416,17433,25500
3,NC_000913,17447,17464,NhaR-Na+DNA-binding-site,1000,+,17447,17464,25500
4,NC_000913,17894,17927,YdeO,1000,+,17894,17927,25500


In [9]:

### BW ###

# extract the starts of the intergenic regions
CP009273_start=two_way_blast['CP009273_intergenic.ffn'].str.split('|', expand=True)[1].str.split('_', expand=True)[0]

#extract the end of intergenic regions
CP009273_end=two_way_blast['CP009273_intergenic.ffn'].str.split('|', expand=True)[1].str.split('_', expand=True)[1]

# add those to the TWB dataframe
two_way_blast['CP009273_start']=CP009273_start
two_way_blast['CP009273_end']=CP009273_end



### MG ###

#NC_000913_start=two_way_blast['NC_000913_intergenic.ffn'].str.split('|', expand=True)[1].str.split('_', expand=True)[0]

#NC_000913_end=two_way_blast['NC_000913_intergenic.ffn'].str.split('|', expand=True)[1].str.split('_', expand=True)[1]

#two_way_blast['NC_000913_start']=NC_000913_start
#two_way_blast['NC_000913_end']=NC_000913_end

In [10]:

two_way_blast.dropna(subset=['NC_000913_start','NC_000913_end'], inplace=True)

In [11]:
# assigning TFBS to BW and correcting the TFBS positions. 
tf_start=[]
tf_stop=[]
intergenic_start=[]
tf_name=[]

for i1,r1 in TF.iterrows():
    for i,r in two_way_blast.iterrows():
        
        start_end=list(range(int(r['NC_000913_start']),int(r['NC_000913_end'])))
        if r1['start'] in start_end or r1['stop'] in start_end:
            tf_start.append(r1['start'])
            tf_stop.append(r1['stop'])
            intergenic_start.append(r['NC_000913_start'])
            tf_name.append(r1['TF_name'])

        

In [12]:
len(tf_start)

2196

In [13]:
tf_MG_df = pd.DataFrame({"tf_start":tf_start, "tf_end":tf_stop, 
                         "intergenic_start":intergenic_start,
                        "tf_name": tf_name})

In [14]:
tf_MG_df.head()

Unnamed: 0,tf_start,tf_end,intergenic_start,tf_name
0,17384,17401,16903,NhaR-Na+DNA-binding-site
1,17406,17423,16903,NhaR-Na+DNA-binding-site
2,17416,17433,16903,NhaR-Na+DNA-binding-site
3,17447,17464,16903,NhaR-Na+DNA-binding-site
4,28252,28272,28207,ArgP


In [15]:
both=pd.merge(tf_MG_df, two_way_blast, left_on='intergenic_start', right_on='NC_000913_start')

In [16]:
len(both)

2196

In [17]:
both['tf_length']=both.tf_end - both.tf_start

both.intergenic_start=both.intergenic_start.astype(int)

both['from_start']=both.tf_start - both.intergenic_start

both_final=both[both['CP009273_intergenic_score']==1.0]

In [21]:
both_final.CP009273_start = both_final.CP009273_start.astype(int)
both_final.CP009273_end = both_final.CP009273_end.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [22]:
both_final['CP009273_TF_start']= both_final['CP009273_start']+ both['from_start']
both_final['CP009273_TF_stop']=both_final['CP009273_TF_start']+ both['tf_length']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  both_final['CP009273_TF_start']= both_final['CP009273_start']+ both['from_start']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  both_final['CP009273_TF_stop']=both_final['CP009273_TF_start']+ both['tf_length']


In [24]:
both_final.head()

Unnamed: 0,tf_start,tf_end,intergenic_start,tf_name,gene_id,CP009273_intergenic_score,CP009273_intergenic.ffn,NC_000913_intergenic_score,NC_000913_intergenic.ffn,CP009273_start,CP009273_end,NC_000913_start,NC_000913_end,tf_length,from_start,CP009273_TF_start,CP009273_TF_stop
0,17384,17401,16903,NhaR-Na+DNA-binding-site,revb4412_forb0019|16903_17488,1.0,revBW25113_4412_forBW25113_0019|16903_17488,1.0,revb4412_forb0019|16903_17488,16903,17488,16903,17488,17,481,17384.0,17401.0
1,17406,17423,16903,NhaR-Na+DNA-binding-site,revb4412_forb0019|16903_17488,1.0,revBW25113_4412_forBW25113_0019|16903_17488,1.0,revb4412_forb0019|16903_17488,16903,17488,16903,17488,17,503,17406.0,17423.0
2,17416,17433,16903,NhaR-Na+DNA-binding-site,revb4412_forb0019|16903_17488,1.0,revBW25113_4412_forBW25113_0019|16903_17488,1.0,revb4412_forb0019|16903_17488,16903,17488,16903,17488,17,513,17416.0,17433.0
3,17447,17464,16903,NhaR-Na+DNA-binding-site,revb4412_forb0019|16903_17488,1.0,revBW25113_4412_forBW25113_0019|16903_17488,1.0,revb4412_forb0019|16903_17488,16903,17488,16903,17488,17,544,17447.0,17464.0
4,28252,28272,28207,ArgP,forb0030_forb0031|28207_28373,1.0,forBW25113_0030_forBW25113_0031|28207_28373,1.0,forb0030_forb0031|28207_28373,28207,28373,28207,28373,20,45,28252.0,28272.0


In [25]:
final = both_final[['tf_name','CP009273_TF_start', 'CP009273_TF_stop']]

In [26]:
final['accession']='CP009273'
final['score']=1000
final['strand']='+'
final['color']='255,0,0'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['accession']='CP009273'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['score']=1000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['strand']='+'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

In [27]:
final.sort_values(by='tf_name', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.sort_values(by='tf_name', inplace=True)


In [28]:
tf_list=final.tf_name.unique()

In [29]:
final.tf_name=final.tf_name.replace('\/','-')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [30]:
for i in tf_list:
    df = final[final.tf_name == i]
    
    if len(df) > 15:
            
        try:
            with open('%s.bed'%i, 'w') as f:
                for i, r in df.iterrows():
                    f.write('%s\t%i\t%i\t%s\t%i\t%s\t%i\t%i\t%s\n'%(r['accession'], r['CP009273_TF_start'], r['CP009273_TF_stop'],
                           r['tf_name'], r['score'], r['strand'], r['CP009273_TF_start'], 
                            r['CP009273_TF_stop'],r['color']))
        except:
            pass

In [31]:
# making bed files for Wellington
for i in tf_list:
    df = final[final.tf_name == i]
    
    if len(df) > 15:
            
        try:
            with open('%s.modified.bed'%i, 'w') as f:
                for i, r in df.iterrows():
                    f.write('%s\t%i\t%i\t%s\n'%(r['accession'],r['CP009273_TF_start'], 
                                                r['CP009273_TF_stop'], r['strand']))                    
                                                                    
                                                                   
        except:
            pass

In [32]:
final=final.sort_values(by='CP009273_TF_start')

In [33]:
final.head()

Unnamed: 0,tf_name,CP009273_TF_start,CP009273_TF_stop,accession,score,strand,color
0,NhaR-Na+DNA-binding-site,17384.0,17401.0,CP009273,1000,+,25500
1,NhaR-Na+DNA-binding-site,17406.0,17423.0,CP009273,1000,+,25500
2,NhaR-Na+DNA-binding-site,17416.0,17433.0,CP009273,1000,+,25500
3,NhaR-Na+DNA-binding-site,17447.0,17464.0,CP009273,1000,+,25500
4,ArgP,28252.0,28272.0,CP009273,1000,+,25500


In [34]:
# finally writing the BW TF bed file
with open('CP009273.bed', 'w') as f:
    for i, r in final.iterrows():
        f.write('%s\t%i\t%i\t%s\t%i\t%s\t%i\t%i\t%s\n'%(r['accession'], r['CP009273_TF_start'], r['CP009273_TF_stop'],
               r['tf_name'], r['score'], r['strand'], r['CP009273_TF_start'], 
                r['CP009273_TF_stop'],r['color']))
