In [2]:
from urllib.request import urlretrieve
from pathlib import Path

taxon = "Agapetus"
raw_folder = Path("../data/raw/bold_api")
out_folder = Path("../data/processed/bold_api")

url = ("http://v3.boldsystems.org/index.php/API_Public/sequence?"
       f"taxon={taxon.replace(' ', '%20')}"
       "&marker=COI-5P"
       "&format=xml")

out_fname = raw_folder / f"{taxon.replace(' ', '_')}.fasta"

In [None]:
urlretrieve(url, filename=out_fname)

In [35]:
from Bio import SeqIO
from random import sample
from io import StringIO
import pandas as pd
import numpy as np

records = [r for r in SeqIO.parse(out_fname, "fasta")]
print(len(records))

# Find the lengths of sequences and filter too long and short sequences
lengths = np.asarray([len(r.seq) for r in records])
filter_ind = np.logical_and((lengths <= 658), (lengths >= 600))
records_subset0 = [records[i] for i in np.where(filter_ind)[0]]
print(len(records_subset0))

# Remove duplicate sequences
seen = set()
records_subset = []
for r in records_subset0:
    if r.seq not in seen:
        seen.add(r.seq)
        records_subset.append(r)

print(len(records_subset))

subset_name = raw_folder / f"{out_fname.stem}_subset.fasta"
with open(subset_name, "w") as output_handle:
    SeqIO.write(records_subset, output_handle, "fasta")
align_name = out_folder / f"{out_fname.stem}.afa"

652
596
359


In [19]:
seen = {}
for r in records:
    if r.seq not in seen.keys():
        seen[r.seq] = []
    seen[r.seq].append(r)

In [29]:
key = sorted([(len(seen[k]), k) for k in seen.keys()], key=lambda x:x[0])[::-1][0][1]
key

Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---')

In [30]:
seen[key]

[SeqRecord(seq=Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---'), id='GMNWL2436-14|Agapetus', name='GMNWL2436-14|Agapetus', description='GMNWL2436-14|Agapetus ochripes|COI-5P|KX142337', dbxrefs=[]),
 SeqRecord(seq=Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---'), id='GMNWL2406-14|Agapetus', name='GMNWL2406-14|Agapetus', description='GMNWL2406-14|Agapetus ochripes|COI-5P|KX141029', dbxrefs=[]),
 SeqRecord(seq=Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---'), id='GMNWL2389-14|Agapetus', name='GMNWL2389-14|Agapetus', description='GMNWL2389-14|Agapetus ochripes|COI-5P|KX141333', dbxrefs=[]),
 SeqRecord(seq=Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---'), id='GMNWL2371-14|Agapetus', name='GMNWL2371-14|Agapetus', description='GMNWL2371-14|Agapetus ochripes|COI-5P|KX143274', dbxrefs=[]),
 SeqRecord(seq=Seq('ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAAGA...---'), id='GMNWL2401-14|Agapetus', name='GMNWL2401-14|A

408

In [22]:
import subprocess
subprocess.run([r"..\muscle.exe",
                    "-align", str(subset_name), 
                    "-output", str(align_name)]
                    )


In [7]:
records_align = [r for r in SeqIO.parse(align_name, "fasta")]
[len(r.seq) for r in records_align]

[667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667,
 667

In [9]:
records_align

[SeqRecord(seq=Seq('-AACTTTATACTTTATTTTCGGAATTTGAAGAGGAATAGTCGGAACTTCTCTAA...TTT'), id='LTUT077-11|Agapetus|COI-5P|KX140757', name='LTUT077-11|Agapetus|COI-5P|KX140757', description='LTUT077-11|Agapetus|COI-5P|KX140757', dbxrefs=[]),
 SeqRecord(seq=Seq('-----------TTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAA...--T'), id='GMNWL2377-14|Agapetus', name='GMNWL2377-14|Agapetus', description='GMNWL2377-14|Agapetus ochripes|COI-5P|KX142868', dbxrefs=[]),
 SeqRecord(seq=Seq('-AACTTTATATTTTATTTTTGGAATTTGAAGAGGAATAGTAGGAACTTCTTTAA...TTC'), id='BDRTR027-12|Agapetus', name='BDRTR027-12|Agapetus', description='BDRTR027-12|Agapetus sp.|COI-5P|KX294420', dbxrefs=[]),
 SeqRecord(seq=Seq('-AACTTTATATTTTATTTTTGGTATTTGAAGAGGAATAGTAGGAACCTCACTAA...TTT'), id='HIEPT107-10|Agapetus', name='HIEPT107-10|Agapetus', description='HIEPT107-10|Agapetus cf. rossi|COI-5P|HQ958585', dbxrefs=[]),
 SeqRecord(seq=Seq('--ACGTTATATTTCATTTTTGGAATTTGAAGAGGTATAGTAGGAACTTCTTTAA...---'), id='GMNWL2383-14|Agapetus', name='GMNWL23