# Information retrieved from the PhiSITE

Manipulation of the FASTA files originated in the PhiSITE database to retrieve the list of phages that have known promoters and terminators

In [1]:
import numpy as np
import pandas as pd
import re
from Bio import SeqIO
from collections import defaultdict

In [2]:
data = defaultdict(list)
with open("promotersphisite.txt") as fp:
  for record in SeqIO.parse(fp,"fasta"):
    #print(record.description)
    record.description= re.split(",", record.description)
    promoter = record.description[0]
    organism = record.description[1]
    RefSeq = record.description[2]
    position = record.description[3]
    sequence = str(record.seq)
    data['Promoter'].append(promoter)
    data['Organism'].append(organism)
    data['RefSeq'].append(RefSeq)
    data["Position"].append(position)
    data['Sequence'].append(sequence)

promoters_df = pd.DataFrame.from_dict(data)

In [3]:
promoters_df

Unnamed: 0,Promoter,Organism,RefSeq,Position,Sequence
0,A0(D) promoter,Enterobacteria phage T7,RefSeq:NC_001604,complement(214..269),AAGATAGGCGTTGACTTGATGGGTCTTTAGGTGTAGGCTTTAGGTG...
1,C promoter,Enterobacteria phage T7,RefSeq:NC_001604,(3068..3123),GATAAGCAACTTGACGCAATGTTAATGGGCTGATAGTCTTATCTTA...
2,A1 promoter,Enterobacteria phage T7,RefSeq:NC_001604,(452..508),AAAAAGAGTATTGACTTAAAGTCTAACCTATAGGATACTTACAGCC...
3,A2 promoter,Enterobacteria phage T7,RefSeq:NC_001604,(579..636),AAAACAGGTATTGACAACATGAAGTAACATGCAGTAAGATACAAAT...
4,A3 promoter,Enterobacteria phage T7,RefSeq:NC_001604,(704..760),AACAAAACGGTTGACAACATGAAGTAAACACGGTACGATGTACCAC...
...,...,...,...,...,...
483,phi11 promoter,Pseudomonad phage gh-1,RefSeq:NC_004665,(18091..18134),GGACCAAGCGAGATGGCGAGCACTACGGACGTTCACACGTTGAG
484,phi12 promoter,Pseudomonad phage gh-1,RefSeq:NC_004665,(18936..18979),GCCACTCAGGTTTAAAACCCTCACTATGGCTGCATGGAGACTTC
485,phi14 promoter,Pseudomonad phage gh-1,RefSeq:NC_004665,(21099..21242),ACATGCGCTCTTACGAAGCAACACTCGAAACAGATGACGAACTCGC...
486,phi15 promoter,Pseudomonad phage gh-1,RefSeq:NC_004665,(24136..24206),TGGGGGGCCTCAAAAACCCTCACTATGGCACCCTATGAGGGTTTCT...


In [4]:
promoters_df.dtypes

Promoter    object
Organism    object
RefSeq      object
Position    object
Sequence    object
dtype: object

In [5]:
promoters_df.describe()

Unnamed: 0,Promoter,Organism,RefSeq,Position,Sequence
count,488,488,488,488,488
unique,392,29,29,484,484
top,pR promoter,Enterobacteria phage T4,RefSeq:NC_000866,complement(42614..42661),ACATTGTGCATGGATGGTTCCAGTACAGAGCGTAATAATAAGGACA...
freq,8,159,159,2,2


In [26]:
phage_list = promoters_df['Organism']
phage_list=phage_list.drop_duplicates().sort_values()
print(len(phage_list))
phage_list

29


426                   Bacillus phage B103
221                   Bacillus phage GA-1
141                     Bacillus phage Nf
147                 Bacillus phage phi105
257                  Bacillus phage phi29
138             Enterobacteria phage 933W
158            Enterobacteria phage HK022
112               Enterobacteria phage Mu
24                Enterobacteria phage P1
190               Enterobacteria phage P2
149              Enterobacteria phage P22
198               Enterobacteria phage P4
204              Enterobacteria phage SP6
447               Enterobacteria phage T3
267               Enterobacteria phage T4
0                 Enterobacteria phage T7
176        Enterobacteria phage VT2-Sakai
128           Enterobacteria phage lambda
180     Escherichia Stx1 converting phage
218                   Mycoplasma phage P1
467                Pseudomonad phage gh-1
244              Pseudomonas phage phiKMV
251                Salmonella phage HK620
120              Staphylococcus ph

In [7]:
data1 = defaultdict(list)
with open("terminatorsphisite.txt") as fp:
  for record in SeqIO.parse(fp,"fasta"):
    #print(record.description)
    record.description= re.split(",", record.description)
    terminator = record.description[0]
    organism = record.description[1]
    RefSeq = record.description[2]
    position = record.description[3]
    sequence = str(record.seq)
    data1['Terminator'].append(terminator)
    data1['Organism'].append(organism)
    data1['RefSeq'].append(RefSeq)
    data1["Position"].append(position)
    data1['Sequence'].append(sequence)
    
terminator_df = pd.DataFrame.from_dict(data1)

In [8]:
terminator_df

Unnamed: 0,Terminator,Organism,RefSeq,Position,Sequence
0,Tphi terminator,Enterobacteria phage T7,RefSeq:NC_001604,(24160..24215),TAACTAGCATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTT...
1,TE terminator,Enterobacteria phage T7,RefSeq:NC_001604,(7552..7591),TAATCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTT
2,CJ terminator,Enterobacteria phage T7,RefSeq:NC_001604,(151..177),CTGTGTCCCTATCTGTTACAGTCTCCT
3,tAsRef terminator,Enterobacteria phage P1,RefSeq:NC_005856,complement(1826..1864),CTTTTGTGCAGCCTGGCTCCTTGCCAGGCTTTTTTTTAT
4,tC8 terminator,Enterobacteria phage P1,RefSeq:NC_005856,(2036..2073),TGACATCATTGGCGGCCATTAGGCCGCCTTTTTTTTGC
...,...,...,...,...,...
172,TD1 terminator,Bacillus phage B103,RefSeq:NC_004165,(16981..17023),AACAAAAACACCTGCTGTTATAATAACGGCAGGCTTTTTAATA
173,TA1 terminator,Bacillus phage B103,RefSeq:NC_004165,complement(3710..3756),GAATCTGAATACGTGGTGTCTAACGGTGATGCCACGTTTTTCTTTTC
174,TD1r terminator,Bacillus phage B103,RefSeq:NC_004165,complement(16981..17023),TATTAAAAAGCCTGCCGTTATTATAACAGCAGGTGTTTTTGTT
175,Tphi terminator,Enterobacteria phage T3,RefSeq:NC_003298,(22342..22400),ACTAATATGCAAACCCCTTGGGTTCCCTCTTTGGGAGTCTGAGGGG...


In [9]:
terminator_df.dtypes

Terminator    object
Organism      object
RefSeq        object
Position      object
Sequence      object
dtype: object

In [10]:
terminator_df.describe()

Unnamed: 0,Terminator,Organism,RefSeq,Position,Sequence
count,177,177,177,177,177
unique,152,23,23,177,175
top,TA1 terminator,Enterobacteria phage T4,RefSeq:NC_000866,(24160..24215),CTACGAGTTTGCCAGCCTCCCCCAGTGGCTGGCTTTTTTATGT
freq,4,41,41,1,2


In [27]:
phage_list_t = terminator_df['Organism']
phage_list_t = phage_list_t.drop_duplicates().sort_values()
print(len(phage_list_t))
phage_list_t

23


172                   Bacillus phage B103
101                   Bacillus phage GA-1
56                      Bacillus phage Nf
128                  Bacillus phage phi29
39                Enterobacteria phage Mu
3                 Enterobacteria phage P1
82                Enterobacteria phage P2
58               Enterobacteria phage P22
87                Enterobacteria phage P4
93               Enterobacteria phage SP6
175               Enterobacteria phage T3
131               Enterobacteria phage T4
0                 Enterobacteria phage T7
48            Enterobacteria phage lambda
79      Escherichia Stx1 converting phage
100                   Mycoplasma phage P1
109              Pseudomonas phage phiKMV
112                Salmonella phage HK620
46               Staphylococcus phage P68
107              Streptococcus phage Cp-1
76              Streptomyces phage phiC31
80                Stx2 converting phage I
81               Stx2 converting phage II
Name: Organism, dtype: object

In [28]:
phage_list = list(phage_list)
phage_list_t = list(phage_list_t)
for t in phage_list_t:
    if t not in phage_list:
        phage_list.append(t)
print(len(phage_list))
phage_list

29


[' Bacillus phage B103',
 ' Bacillus phage GA-1',
 ' Bacillus phage Nf',
 ' Bacillus phage phi105',
 ' Bacillus phage phi29',
 ' Enterobacteria phage 933W',
 ' Enterobacteria phage HK022',
 ' Enterobacteria phage Mu',
 ' Enterobacteria phage P1',
 ' Enterobacteria phage P2',
 ' Enterobacteria phage P22',
 ' Enterobacteria phage P4',
 ' Enterobacteria phage SP6',
 ' Enterobacteria phage T3',
 ' Enterobacteria phage T4',
 ' Enterobacteria phage T7',
 ' Enterobacteria phage VT2-Sakai',
 ' Enterobacteria phage lambda',
 ' Escherichia Stx1 converting phage',
 ' Mycoplasma phage P1',
 ' Pseudomonad phage gh-1',
 ' Pseudomonas phage phiKMV',
 ' Salmonella phage HK620',
 ' Staphylococcus phage P68',
 ' Streptococcus phage Cp-1',
 ' Streptomyces phage phiC31',
 ' Stx2 converting phage I',
 ' Stx2 converting phage II',
 ' Yersinia phage phiYeO3-12']

In [29]:
phage_list_file = open("bacteriohages_list.txt","w")
for b in phage_list:
    phage_list_file.write(b+"\n")
phage_list_file.close()