In [4]:
import glob
import pandas
from Bio import Entrez
from Bio import SeqIO

# Contact address passed to NCBI with every Entrez request made through Biopython.
Entrez.email = "mfoster11@mgh.harvard.edu"

# Switch for the download cell: the GenBank records are only fetched from NCBI
# when this is True; otherwise the notebook works from the files already on disk.
DOWNLOAD_GBS = False

In [5]:
#list_of_ids = "plasmid_seqs/plasmid_ids.txt"
# Base directory for the plasmid sequence files. It already ends in "/", so paths
# built as f"{output_dir}/..." contain a double slash.
output_dir = "plasmid_seqs/"

In [6]:
# The list of GenBank accessions was already filtered down in an earlier pass,
# so reuse that table instead of rebuilding it. Each row maps an accession (ID)
# to a human-readable plasmid name and the strain it came from.
acc_to_plasmids = pandas.read_csv("plasmid_seqs/acc_to_plasmid.tsv", sep="\t")
list_of_ids = acc_to_plasmids["ID"].tolist()
print(len(list_of_ids)) # 317
print(acc_to_plasmids)

317
             ID     Plasmid   Strain
0    CP124100.1  chromosome  NE_5261
1    CP124098.1        lp17  NE_5261
2    CP124097.1        cp26  NE_5261
3    CP124099.1        lp54  NE_5261
4    CP019844.1  chromosome     PAli
..          ...         ...      ...
312  AE001581.1      cp32-9      B31
313  AE000788.1        lp36      B31
314  AE000787.1        lp38      B31
315  AE000790.2        lp54      B31
316  AE001584.1        lp56      B31

[317 rows x 3 columns]


In [7]:
# Download every accession in list_of_ids from NCBI and store it as
# plasmid_seqs/raw_gbs/<accession>.gb. Skipped entirely unless DOWNLOAD_GBS is set.
# # # IT KEPT FAILING BECAUSE PBSV2 WAS INCLUDED !!!! # # #
# example command
# with Entrez.efetch(db="nucleotide", id=accession, rettype="gbwithparts", retmode="text") as handle:
if DOWNLOAD_GBS:
    for accession in list_of_ids:
        accession = accession.strip()
        print(f"Downloading {accession}")
        # "gbwithparts" is an efetch *rettype* (GenBank flat file including the
        # full sequence); the matching retmode is "text". The earlier call passed
        # it as retmode with rettype="gb", which is not a documented combination.
        with Entrez.efetch(db="nucleotide", id=accession, rettype="gbwithparts", retmode="text") as gb_handle:
            record = SeqIO.read(gb_handle, "genbank")
        # The record is fully parsed at this point, so the network handle is
        # already closed before the file is written.
        with open(f"{output_dir}/raw_gbs/{accession}.gb", "w") as output_handle:
            SeqIO.write(record, output_handle, "genbank")
        print(f"Downloaded {accession}")

In [8]:
# Collect the paths of the GenBank files present in raw_gbs/; later cells check
# membership in this list before opening a file. Expected count: 317, one per accession.
genbanks = glob.glob("plasmid_seqs/raw_gbs/*.gb")
print(f"Found {len(genbanks)} genbank files") # 317 :)

Found 317 genbank files


In [9]:
# Walk every accession in the table, open its GenBank file if it was downloaded,
# and record the plasmid and strain label of each record found in it.
list_of_plasmids = []
list_of_strains = []
for accession in acc_to_plasmids["ID"]:
    accession = accession.split("/")[-1]
    gb_path = f"plasmid_seqs/raw_gbs/{accession}.gb"
    if gb_path not in genbanks:
        continue
    with open(gb_path, "r") as gb_handle:
        for rec in SeqIO.parse(gb_handle, "genbank"):
            # Rows of the mapping table whose ID equals this record's id.
            matching_rows = acc_to_plasmids[acc_to_plasmids["ID"] == rec.id]
            list_of_strains.append(matching_rows["Strain"].values[0])
            list_of_plasmids.append(matching_rows["Plasmid"].values[0])

# Total records seen, then the number of distinct plasmid / strain labels.
print(len(list_of_plasmids))# 317
print(len(list_of_strains))# 317
print(len(set(list_of_plasmids)))# 40
print(len(set(list_of_strains)))# 26

317
317
40
26


In [10]:
# Product-name fragments that mark a CDS as a PF32 / ParA candidate.
pf32_names = [
    "parA", "ParA", "ParA family",
    "BscQ", "bscQ",
    "PF32", "Pfam32", "Pfam-32", "PF-32",
    "plasmid partition protein",
]
# Product-name fragments that flag an annotation as tentative.
putative_names = ["Putative", "Possible", "putative", "possible"]

In [11]:
def _translate_cds(feature, rec, label):
    """Translate one CDS feature of ``rec`` into protein (translation table 11).

    The nucleotide sequence is taken with ``feature.extract(rec.seq)``. When its
    length is not a multiple of three it is right-padded with "N" up to the next
    complete codon, and a notice is printed. Translation stops at the first stop
    codon (``to_stop=True``).

    ``label`` is only used to identify the record in the printed notice.
    """
    nucseq = feature.extract(rec.seq)
    overhang = len(nucseq) % 3
    if overhang:
        print(f"Sequence length not divisible by 3 for {rec.id}---{label}")
        # Pad to a whole codon: one N when two bases are left over, two N when one is.
        nucseq += "N" * (3 - overhang)
        print("adding N!")
    return nucseq.translate(table=11, to_stop=True)


# plasmid name -> list of SeqRecord (protein) for confident and tentative hits.
pf32_genes = {}
putatives = {}
# Now we will iterate through the genbank files and look for the pf32 genes
for accession in acc_to_plasmids["ID"]:
    accession = accession.split("/")[-1]
    genbank_file = f"plasmid_seqs/raw_gbs/{accession}.gb"
    if genbank_file not in genbanks:
        continue
    with open(genbank_file, "r") as gb_handle:
        for rec in SeqIO.parse(gb_handle, "genbank"):
            # Look the record up once and reuse the row for both labels.
            table_rows = acc_to_plasmids[acc_to_plasmids["ID"] == rec.id]
            strain = table_rows["Strain"].values[0]
            plasmid = table_rows["Plasmid"].values[0]
            # Every plasmid gets an entry in both dicts, even when nothing is found.
            pf32_genes.setdefault(plasmid, [])
            putatives.setdefault(plasmid, [])
            # One counter per record, shared by confident and putative hits.
            par_count = 1
            for feature in rec.features:
                if feature.type != "CDS":
                    continue
                # A CDS without a /product qualifier cannot match any name; skip
                # it instead of raising KeyError.
                product = feature.qualifiers.get("product", [""])[0]
                if not any(name in product for name in pf32_names):
                    continue
                is_putative = any(name in product for name in putative_names)
                protseq = _translate_cds(feature, rec, f"{strain}_{plasmid}")
                if "protein_id" in feature.qualifiers:
                    prot_id = str(feature.qualifiers["protein_id"][0].strip())
                else:
                    prot_id = "NA"
                suffix = "_PUTATIVE" if is_putative else ""
                gene = SeqIO.SeqRecord(
                    protseq,
                    id=f"{strain}_{plasmid}_ParA_{par_count}{suffix}",
                    name=product,
                    description=f"{rec.id}-{prot_id}",
                    dbxrefs=None,
                )
                if is_putative:
                    putatives[plasmid].append(gene)
                else:
                    pf32_genes[plasmid].append(gene)
                par_count += 1


In [12]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import pprint

# Dump the per-plasmid lists of ParA hits whose product name carries no putative marker.
pprint.pprint(pf32_genes)

{'chromosome': [SeqRecord(seq=Seq('MEDQAQSLRDMMRLNGKFNFSVDEKVQNSKTRFIAVSSGKGGVGKSNIAIGLAL...GVE'), id='NE_5261_chromosome_ParA_1', name='MinD/ParA family protein', description='CP124100.1-WKC97063.1', dbxrefs=[]),
                SeqRecord(seq=Seq('LIIIPVASGKGGVGKSLFSTNIAICLANEGKSVLLVDLDLGASNLHSMLNIIPK...KEV'), id='NE_5261_chromosome_ParA_2', name='MinD/ParA family protein', description='CP124100.1-WKC97145.1', dbxrefs=[]),
                SeqRecord(seq=Seq('MKIISVINQKGGVGKTTSAINISYSMTLLNKKILLIDIDSQGNSTSGTNTSKHI...IKE'), id='NE_5261_chromosome_ParA_3', name='ParA family protein', description='CP124100.1-WKC97211.1', dbxrefs=[]),
                SeqRecord(seq=Seq('MTKIIPVASGKGGVGKTSFVANVGYKLSSLGKTVILVDLDLGGSNLHTCLGVKN...IRK'), id='NE_5261_chromosome_ParA_4', name='MinD/ParA family protein', description='CP124100.1-WKC97483.1', dbxrefs=[]),
                SeqRecord(seq=Seq('MDIKKPDIIALTSVKGGVGKSTLSILFSYLLKELGKKILLIDLDPQNSLTSYFT...FSL'), id='JD1_chromosome_ParA_2', name='CobQ/CobB/MinD/P

In [156]:
# Dump the per-plasmid lists of hits whose product name also contains a putative/possible marker.
pprint.pprint(putatives)

{'chromosome': [SeqRecord(seq=Seq('MKIISVINQKGGVGKTTSAINISYSMTLLNKKILLIDIDSQGNSTSGTNTSEYI...IKE'), id='JD1_chromosome_ParA_PUTATIVE_1', name='CobQ/CobB/MinD/ParA nucleotide binding domain, putative', description='CP002312.1-ADQ30850.1', dbxrefs=[]),
                SeqRecord(seq=Seq('MEKKENKKDIILYKRVETQIEEVDINQNRELKNYNELKEQLAYNLKLDIDSKFQ...KEE'), id='JD1_chromosome_ParA_PUTATIVE_3', name='Putative plasmid partition protein', description='CP002312.1-ADQ30832.1', dbxrefs=[]),
                SeqRecord(seq=Seq('LHLFNKENTSYKELFLKHKLERVFPNYNFNYVIIDTPPNLDSLLDNALNITNRL...LGI'), id='MM1_chromosome_ParA_PUTATIVE_2', name='plasmid partition protein, putative', description='CP031412.1-AXK70830.1', dbxrefs=[]),
                SeqRecord(seq=Seq('MKIISVINQKGGVGKTTSAINISYSMTLLNKKILLIDIDSQGNSTSGTNTSKHI...IKE'), id='ZS7_chromosome_ParA_PUTATIVE_1', name='putative CobQ/CobB/MinD/ParA nucleotide binding domain protein', description='CP001205.1-ACK74566.1', dbxrefs=[]),
                SeqRecord(seq=Seq(

In [13]:
# Show which accessions/strains carry these two plasmids.
for plasmid_name in ("lp5", "lp28-8"):
    print(acc_to_plasmids[acc_to_plasmids["Plasmid"] == plasmid_name])

             ID Plasmid   Strain
106  CP001462.1     lp5  WI91-23
296  AE001583.1     lp5      B31
            ID Plasmid Strain
65  CP031409.1  lp28-8    MM1


In [18]:
# Write one multi-FASTA per plasmid; plasmids with an empty hit list are reported and skipped.
for plasmid, par_records in pf32_genes.items():
    if par_records == []:
        print(f"No ParA genes found for {plasmid}")
    else:
        print(f"Writing {plasmid}")
        fasta_path = f"plasmid_seqs/parA_multi/{plasmid}_pf32_genes.fasta"
        with open(fasta_path, "w") as output_handle:
            SeqIO.write(par_records, output_handle, "fasta")
        print(f"Wrote {plasmid}")

Writing chromosome
Wrote chromosome
Writing lp17
Wrote lp17
Writing cp26
Wrote cp26
Writing lp54
Wrote lp54
Writing cp32-1
Wrote cp32-1
Writing cp32-3
Wrote cp32-3
Writing cp32-4
Wrote cp32-4
Writing cp32-5
Wrote cp32-5
Writing cp32-9
Wrote cp32-9
Writing lp36
Wrote lp36
Writing lp38
Wrote lp38
Writing lp56
Wrote lp56
Writing cp32-1+5
Wrote cp32-1+5
Writing cp32-10
Wrote cp32-10
Writing cp32-11
Wrote cp32-11
Writing cp32-12
Wrote cp32-12
Writing cp32-6
Wrote cp32-6
Writing cp32-8
Wrote cp32-8
Writing lp25
Wrote lp25
Writing lp28-1
Wrote lp28-1
Writing lp28-3
Wrote lp28-3
Writing lp28-4
Wrote lp28-4
Writing lp28-5
Wrote lp28-5
Writing lp28-6
Wrote lp28-6
Writing lp28-7
Wrote lp28-7
No ParA genes found for cp9
Writing lp21
Wrote lp21
Writing lp28-2
Wrote lp28-2
Writing cp32-7
Wrote cp32-7
Writing lp28-8
Wrote lp28-8
Writing cp32-2
Wrote cp32-2
Writing cp32-5-1
Wrote cp32-5-1
Writing cp32-9-4
Wrote cp32-9-4
No ParA genes found for cp9-3
No ParA genes found for lp5
Writing lp21-cp9
Wrote l

In [19]:
# Write the putative hits of every plasmid to its own multi-FASTA. No emptiness
# check is made here, so a plasmid with an empty list still gets its file opened for writing.
for plasmid, putative_records in putatives.items():
    print(f"Writing {plasmid}")
    fasta_path = f"plasmid_seqs/putative_multi/{plasmid}_pf32_genes_putative.fasta"
    with open(fasta_path, "w") as output_handle:
        SeqIO.write(putative_records, output_handle, "fasta")
    print(f"Wrote {plasmid}")

Writing chromosome
Wrote chromosome
Writing lp17
Wrote lp17
Writing cp26
Wrote cp26
Writing lp54
Wrote lp54
Writing cp32-1
Wrote cp32-1
Writing cp32-3
Wrote cp32-3
Writing cp32-4
Wrote cp32-4
Writing cp32-5
Wrote cp32-5
Writing cp32-9
Wrote cp32-9
Writing lp36
Wrote lp36
Writing lp38
Wrote lp38
Writing lp56
Wrote lp56
Writing cp32-1+5
Wrote cp32-1+5
Writing cp32-10
Wrote cp32-10
Writing cp32-11
Wrote cp32-11
Writing cp32-12
Wrote cp32-12
Writing cp32-6
Wrote cp32-6
Writing cp32-8
Wrote cp32-8
Writing lp25
Wrote lp25
Writing lp28-1
Wrote lp28-1
Writing lp28-3
Wrote lp28-3
Writing lp28-4
Wrote lp28-4
Writing lp28-5
Wrote lp28-5
Writing lp28-6
Wrote lp28-6
Writing lp28-7
Wrote lp28-7
Writing cp9
Wrote cp9
Writing lp21
Wrote lp21
Writing lp28-2
Wrote lp28-2
Writing cp32-7
Wrote cp32-7
Writing lp28-8
Wrote lp28-8
Writing cp32-2
Wrote cp32-2
Writing cp32-5-1
Wrote cp32-5-1
Writing cp32-9-4
Wrote cp32-9-4
Writing cp9-3
Wrote cp9-3
Writing lp5
Wrote lp5
Writing lp21-cp9
Wrote lp21-cp9
Writing 

In [2]:
# List the per-plasmid ParA FASTA files, then concatenate each directory's FASTA
# files into one combined file under plasmid_seqs/.
# NOTE(review): the *.fasta globs match every FASTA present in these directories,
# including any file left over from an earlier run - confirm the directories hold
# only current per-plasmid output before trusting the combined files.
!ls plasmid_seqs/parA_multi/
!cat plasmid_seqs/parA_multi/*.fasta > plasmid_seqs/all_pf32_genes.fasta
!cat plasmid_seqs/putative_multi/*.fasta > plasmid_seqs/all_pf32_genes_putative.fasta

all_pf32_genes.fasta        cp9-3_pf32_genes.fasta
all_pf32_genes_aligned.afa  cp9_pf32_genes.fasta
chromosome_pf32_genes.fasta lp17_pf32_genes.fasta
cp26_pf32_genes.fasta       lp21-cp9_pf32_genes.fasta
cp32-1+5_pf32_genes.fasta   lp21_pf32_genes.fasta
cp32-10_pf32_genes.fasta    lp25_pf32_genes.fasta
cp32-11_pf32_genes.fasta    lp28-11_pf32_genes.fasta
cp32-12_pf32_genes.fasta    lp28-1_pf32_genes.fasta
cp32-1_pf32_genes.fasta     lp28-2_pf32_genes.fasta
cp32-2_pf32_genes.fasta     lp28-3_pf32_genes.fasta
cp32-3+10_pf32_genes.fasta  lp28-4_pf32_genes.fasta
cp32-3_pf32_genes.fasta     lp28-5_pf32_genes.fasta
cp32-4_pf32_genes.fasta     lp28-6_pf32_genes.fasta
cp32-5+1_pf32_genes.fasta   lp28-7_pf32_genes.fasta
cp32-5-1_pf32_genes.fasta   lp28-8_pf32_genes.fasta
cp32-5_pf32_genes.fasta     lp28-9_pf32_genes.fasta
cp32-6_pf32_genes.fasta     lp36_pf32_genes.fasta
cp32-7_pf32_genes.fasta     lp38_pf32_genes.fasta
cp32-8_pf32_genes.fasta     lp54_pf32_genes.fasta
cp32-9-4_pf32_genes.fasta

In [1]:
#import subprocess
# MUSCLE command lines that build one multiple-sequence alignment from each
# combined FASTA (ParA hits and putative hits).
# JUST RUN THESE OUTSIDE OF JUPYTER
cmdparA = "muscle -align plasmid_seqs/all_pf32_genes.fasta -output plasmid_seqs/all_pf32_genes_aligned.afa"
# The putative alignment gets its own output file; it previously pointed at the
# same .afa as cmdparA, so running both overwrote the ParA alignment.
cmdput = "muscle -align plasmid_seqs/all_pf32_genes_putative.fasta -output plasmid_seqs/all_pf32_genes_putative_aligned.afa"

#process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#stdout, stderr = process.communicate()
#
#print("stdout:", stdout.decode())
#print("stderr:", stderr.decode())
