In [77]:
from tempfile import gettempdir, NamedTemporaryFile
import biotite.database.entrez as entrez
import biotite.sequence.io.fasta as fasta
from biotite.sequence import NucleotideSequence
from tqdm import tqdm
from plannotate.resources import get_resource
import pandas as pd
import subprocess
%load_ext autoreload
%autoreload 2

def parse_pipe(sseqid):
    """Return the second '|'-delimited field of a sequence identifier.

    Identifiers that contain no '|' are returned unchanged, e.g.
    ``"pdb|7E6X|A"`` becomes ``"7E6X"`` while ``"QTV24999.1"`` stays as is.
    """
    # Guard clause: nothing to strip when the identifier has no pipe.
    if '|' not in sseqid:
        return sseqid
    # Only the field after the first pipe is needed, so stop splitting early.
    return sseqid.split('|', 2)[1]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Search NCBI Entrez for channelrhodopsin proteins

In [82]:
# Query the Entrez "protein" database for the term "Channelrhodopsin",
# asking for 150 hits (number=150); the returned IDs are kept in `gis`.
gis = entrez.search(entrez.SimpleQuery("Channelrhodopsin"), db_name="protein", number=150)

In [83]:
# Download every hit as a FASTA file (".fa" suffix) into the system temp
# directory; `filepaths` holds what entrez.fetch returns for the IDs in `gis`.
filepaths = entrez.fetch(
    gis,
    gettempdir(),
    suffix="fa",
    db_name="protein",
    ret_type="fasta",
)

# Parse each downloaded file; tqdm only contributes the progress bar.
recs = [fasta.FastaFile.read(fp) for fp in tqdm(filepaths)]

100%|██████████| 150/150 [00:00<00:00, 9773.29it/s]


In [84]:
# recs = []

In [85]:
# Merge all downloaded records into one FASTA file object, copying each
# header -> sequence entry across. A header seen again overwrites the
# earlier entry, exactly as a mapping update would.
new_fasta = fasta.FastaFile()
for rec in recs:
    for header, sequence in rec.items():
        new_fasta[header] = sequence

# Add custom sequences (translated from DNA to protein)

In [88]:
customs = {
    'stanford-ts':  'aagagcaggatcaccagcgagggcgagtacatccccctggaccagatcgacatcaacgtg',
    'kv2.1-ts': 'cagtcccagcccatcctcaacaccaaggagatggccccgcagagcaagcctccagaggagctggagatgagcagcatgcccagccccgtggcccctctgcccgcacgcacggagggcgtcatcgacatgcggagcatgtccagcattgacagcttcatcagctgtgccacggacttccctgaagccaccagattc',
    'HA-tag': 'tacccatacgatgttccagattacgcttaa',
    'p2A-linker': 'gccacgaacttctctctgttaaagcaagcaggagacgtggaagaaaaccccggtccc',
    'GCaMP6m': 'atggtggactcatcacgtcgtaagtggaataagacaggtcacgcagtcagagctataggtcggctgagctcactcgagaacgtctatatcaaggccgacaagcagaagaacggcatcaaggcgaacttcaagatccgccacaacatcgaggacggcggcgtgcagctcgcctaccactaccagcagaacacccccatcggcgacggccccgtgctgctgcccgacaaccactacctgagcgtgcagtccaaactttcgaaagaccccaacgagaagcgcgatcacatggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaagggcggtaccggagggagcatggtgagcaagggcgaggagctgttcaccggggtggtgcccatcctggtcgagctggacggcgacgtaaacggccacaagttcagcgtgtccggcgagggtgagggcgatgccacctacggcaagctgaccctgaagttcatctgcaccaccggcaagctgcccgtgccctggcccaccctcgtgaccaccctgacctacggcgtgcagtgcttcagccgctaccccgaccacatgaagcagcacgacttcttcaagtccgccatgcccgaaggctacatccaggagcgcaccatcttcttcaaggacgacggcaactacaagacccgcgccgaggtgaagttcgagggcgacaccctggtgaaccgcatcgagctgaagggcatcgacttcaaggaggacggcaacatcctggggcacaagctggagtacaacctgccggaccaactgactgaagagcagatcgcagaatttaaagaggctttctccctatttgacaaggacggggatgggacaataacaaccaaggagctggggacggtgatgcggtctctggggcagaaccccacagaagcagagctgcaggacatgatcaatgaagtagatgccgacggtgacggcacaatcgacttccctgagttcctgacaatgatggcaagaaaagggagctacagggacacggaagaagaaattagagaagcgttcggtgtgtttgataaggatggcaatggctacatcagtgcagcagagcttcgccacgtgatgacaaaccttggagagaagttaacagatgaagaggttgatgaaatgatcagggaagcagacatcgatggggatggtcaggtaaactacgaagagtttgtacaaatgatgacagcgaag',
    'FLASH': 'atgagcagcattacttgcgacccagccatctacggggagtggtcccgcgaaaaccagttctgcgtggagaagtctctgattaccctggatggaatcaaatatgtgcagctggtcatggcagtggtctctgcctgccaggtgttctttatggtcacccgggcccccaaggtgccttgggaggcaatctacctgcccaccacagaaatgatcacctatagtctggcctttacaggcaacgggtacatccaggtggctaatggcaaatatctgccttgggccaggatggccagctggctgtgcacatgtccaatcatgctggggctggtgtctaacatggccctggtcaagtacaaaagtattccactgaatcccatgatgattgccgcttctagtatctgcacagtgttcggcatcactgccagcgtggtcctggaccctctgcacgtgtggctgtactgtttcatctcaagcattttctttatcttcgagatggtggtcgcattcgccatttttgccatcactattcatgattttcagaccatcggaagccccatgtccctgaaggtggtcgaacggctgaaactgatgagaatcgtgttctacgtgagctggatggcatatcctatcctgtggagcttttcctctacaggggcctgcattatgtccgagaacaccagcagcgtgctgtacctgctgggcgacgccctgtgcaagcagacatatggcatcctgctgtgggcaactacctggggactgctgaacggcaagtgggaccgagattacgtgaaagggcggaacgtggacggaactctgatgcccgagtacgaacaggatctggagaaagga',
    'iC++': 'atggactatggcggcgctttgtctgccgtcggactcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggatttcttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggagaatatttacgtggccacgatccagatgatcaagttcatcatcgagtatttccatagctttgacgaacctgcggtgatctactcatccaacggcaacaagaccaggtggcttcgttacgcgagctggctgctgacctgccctgtcattcttatccacctgagcaaccttacgggtctggcgaacgactataacaagcgtaccatgggtctgctggtgtcagatatcggcacgatcgtgtggggcaccacggccgcgctgtccaagggatacgtccgtgtcattttcttcctgatgggcctgtgctacggcatctacacattcttcaacgcagccaaggtctacattgaggcgtaccacaccgtgccgaagggccggtgtcgccaggtggtgactggcatggcttggctcttcttcgtatcatggggtatgttccccatcctgttcatcctcggccccgagggcttcggcgtcctgagcaggtacggctccaacgtcggccacaccatcattgacctgatgtcgaagcagtgctggggtctgctcggccactacctgcgcgtgctgatccacagccatatcctcatccacggcgacattcgcaagaccaccaaattgaacattggtggcactgagattgaggtcgagacgctggtggaggacgaggccgaggctggcgcggtc',
    'SwiChR++': 'atggactatggcggcgctttgtctgccgtcggactcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggatttcttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggagaatatttacgtggccacgatccagatgatcaagttcatcatcgagtatttccatagctttgacgaacctgcggtgatctactcatccaacggcaacaagaccaggtggcttcgttacgcgagctggctgctgaccgcccctgtcattcttatccacctgagcaaccttacgggtctggcgaacgactataacaagcgtaccatgggtctgctggtgtcagatatcggcacgatcgtgtggggcaccacggccgcgctgtccaagggatacgtccgtgtcattttcttcctgatgggcctgtgctacggcatctacacattcttcaacgcagccaaggtctacattgaggcgtaccacaccgtgccgaagggccggtgtcgccaggtggtgactggcatggcttggctcttcttcgtatcatggggtatgttccccatcctgttcatcctcggccccgagggcttcggcgtcctgagcaggtacggctccaacgtcggccacaccatcattgacctgatgtcgaagcagtgctggggtctgctcggccactacctgcgcgtgctgatccacagccatatcctcatccacggcgacattcgcaagaccaccaaattgaacattggtggcactgagattgaggtcgagacgctggtggaggacgaggccgaggctggcgcggtc',
    'VChR1': 'atggattaccctgttgctagaagcctcatagttcgctacccaaccgacctcggaaacggcaccgtctgcatgccaagaggacagtgttactgtgaaggttggcttcggagtcgcggcacttccattgaaaagacaatagcaattactcttcagtgggtagtctttgctttgtcagtggcttgcctggggtggtatgcgtatcaagcgtggcgagctacctgcggatgggaggaggtttacgtagccttgatagaaatgatgaaaagcatcatcgaggccttccacgagttcgacagccctgcaacactgtggctgtcttcagggaacggcgtagtttggatgcggtatggcgaatggctcctcacctgcccggtccttctgatccatctgagcaacctcacaggcctgaaggacgattatagcaaaaggactatgggcctgttggtttctgatgtgggatgcatcgtgtggggcgcaaccagcgccatgtgtacggggtggacgaagatcctgttcttcctcatctcattgagctatggtatgtatacctattttcatgctgctaaagtttatatcgaagcattccacacagttccaaaagggatttgtcgagaactggtccgagtgatggcctggacattctttgtggcttggggaatgtttccagtcctgtttctgctgggcacggaaggattcggtcatatcagcccttatggatctgccattgggcactccatcctcgacctgattgcaaagaacatgtggggtgtgctggggaattacctgcgcgtcaaaatccacgagcacatcctgttgtatggcgacatcagaaagaagcagaaaattacgatcgccggccaagagatggaggttgagacactggtggctgaagaggaggac',
    'bReaches': 'atggactatggcggcgctttgtctgccgtcggactgttccagaccagctacaccctggagaacaacggcagcgtgatctgcatccccaacaacggccagtgcttctgcctggcctggctgaagagtaacggcaccaacgccgagaagctggccgccaacatcctgcagtgggtggtgtttgcgctgagcgtggcgtgcctgggctggtatgcgtatcaggcgtggcgcgcgacctgcggctgggaagaagtgtatgtggcgctgattgaaatgatgaaaagcattattgaagcgtttcatgaatttgatagcccggcgaccctgtggctgagcagcggcaacggcgtggtgtggatgcgctatggctcatggctgctgacctgccccgtgattctgattcatctgagcaacctgaccggcctgaaagatgattatagcaaacgcaccatgggcctgctggtgagcgacgtgggctgcattgtgtggggcgcgaccagcgcgatgtgcaccggctggaccaaaattctgttttttctgattagcctgagctatggcatgtatacctattttcatgcggccaaagtgtatattgaagcgtttcataccgtgccgaaaggcctgtgcagacagctggtgagagccatggcctggctgttcttcgtgagctggggcatgttccccgtgctgttcctgctgggccccgagggcttcggccatattagcccgtatggcagcgcgattggccatagcattctggatctgattgcgaagaacatgtggggcgtgctgggcaactatctgcgcgtgaaaattcatgaacatattctgctgtatggcgatattcgcaaaaaacagaaaattaccattgcgggccaggaaatggaagtggaaaccctggtggcggaagaagaagat',
    'C1V1': 'atgtcgcggaggccatggcttcttgccctagcgctggcagtggcgctggcggccggcagcgcaggagcctcgactggcagtgacgcgacggtgccggtcgcgactcaggatggccccgactacgttttccaccgtgcccacgagcgcatgctcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggattacttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggaggagatttacgtggccacgatcgagatgatcaagttcatcatcgagtatttccatgagtttgacgaacctgcggtgatctactcatccaacggcaacaagaccgtgtggcttcgttacgcggagtggctgctcacctgcccggtccttctgatccatctgagcaacctcacaggcctgaaggacgattatagcaaaaggactatgggcctgttggtttctgatgtgggatgcatcgtgtggggcgcaaccagcgccatgtgtacggggtggacgaagatcctgttcttcctcatctcattgagctatggtatgtatacctattttcatgctgctaaagtttatatcgaagcattccacacagttccaaaagggatttgtcgagaactggtccgagtgatggcctggacattctttgtggcttggggaatgtttccagtcctgtttctgctgggcacggaaggattcggtcatatcagcccttatggatctgccattgggcactccatcctcgacctgattgcaaagaacatgtggggtgtgctggggaattacctgcgcgtcaaaatccacgagcacatcctgttgtatggcgacatcagaaagaagcagaaaattacgatcgccggccaagagatggaggttgagacactggtggctgaagaggaggac',
    'C1V1(E122T)': 'atgtcgcggaggccatggcttcttgccctagcgctggcagtggcgctggcggccggcagcgcaggagcctcgactggcagtgacgcgacggtgccggtcgcgactcaggatggccccgactacgttttccaccgtgcccacgagcgcatgctcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggattacttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggagaccatttacgtggccacgatcgagatgatcaagttcatcatcgagtatttccatgagtttgacgaacctgcggtgatctactcatccaacggcaacaagaccgtgtggcttcgttacgcgacctggctgctcacctgcccggtccttctgatccatctgagcaacctcacaggcctgaaggacgattatagcaaaaggactatgggcctgttggtttctgatgtgggatgcatcgtgtggggcgcaaccagcgccatgtgtacggggtggacgaagatcctgttcttcctcatctcattgagctatggtatgtatacctattttcatgctgctaaagtttatatcgaagcattccacacagttccaaaagggatttgtcgagaactggtccgagtgatggcctggacattctttgtggcttggggaatgtttccagtcctgtttctgctgggcacggaaggattcggtcatatcagcccttatggatctgccattgggcactccatcctcgacctgattgcaaagaacatgtggggtgtgctggggaattacctgcgcgtcaaaatccacgagcacatcctgttgtatggcgacatcagaaagaagcagaaaattacgatcgccggccaagagatggaggttgagacactggtggctgaagaggaggac',
    'C1V1(E162T)': 'atgtcgcggaggccatggcttcttgccctagcgctggcagtggcgctggcggccggcagcgcaggagcctcgactggcagtgacgcgacggtgccggtcgcgactcaggatggccccgactacgttttccaccgtgcccacgagcgcatgctcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggattacttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggaggagatttacgtggccacgatcgagatgatcaagttcatcatcgagtatttccatgagtttgacgaacctgcggtgatctactcatccaacggcaacaagaccgtgtggcttcgttacgcgacctggctgctcacctgcccggtccttctgatccatctgagcaacctcacaggcctgaaggacgattatagcaaaaggactatgggcctgttggtttctgatgtgggatgcatcgtgtggggcgcaaccagcgccatgtgtacggggtggacgaagatcctgttcttcctcatctcattgagctatggtatgtatacctattttcatgctgctaaagtttatatcgaagcattccacacagttccaaaagggatttgtcgagaactggtccgagtgatggcctggacattctttgtggcttggggaatgtttccagtcctgtttctgctgggcacggaaggattcggtcatatcagcccttatggatctgccattgggcactccatcctcgacctgattgcaaagaacatgtggggtgtgctggggaattacctgcgcgtcaaaatccacgagcacatcctgttgtatggcgacatcagaaagaagcagaaaattacgatcgccggccaagagatggaggttgagacactggtggctgaagaggaggac',
    'C1V1(E122T/E162T)': 'atgtcgcggaggccatggcttcttgccctagcgctggcagtggcgctggcggccggcagcgcaggagcctcgactggcagtgacgcgacggtgccggtcgcgactcaggatggccccgactacgttttccaccgtgcccacgagcgcatgctcttccaaacctcatacactcttgagaacaatggttctgttatttgcatcccgaacaacggccagtgcttctgcttggcttggcttaaatccaacggaacaaatgccgagaagttggctgccaacattctgcagtggattacttttgcgctttcagcgctctgcctgatgttctacggctaccagacctggaagtctacttgcggctgggagaccatttacgtggccacgatcgagatgatcaagttcatcatcgagtatttccatgagtttgacgaacctgcggtgatctactcatccaacggcaacaagaccgtgtggcttcgttacgcgacctggctgctcacctgcccggtccttctgatccatctgagcaacctcacaggcctgaaggacgattatagcaaaaggactatgggcctgttggtttctgatgtgggatgcatcgtgtggggcgcaaccagcgccatgtgtacggggtggacgaagatcctgttcttcctcatctcattgagctatggtatgtatacctattttcatgctgctaaagtttatatcgaagcattccacacagttccaaaagggatttgtcgagaactggtccgagtgatggcctggacattctttgtggcttggggaatgtttccagtcctgtttctgctgggcacggaaggattcggtcatatcagcccttatggatctgccattgggcactccatcctcgacctgattgcaaagaacatgtggggtgtgctggggaattacctgcgcgtcaaaatccacgagcacatcctgttgtatggcgacatcagaaagaagcagaaaattacgatcgccggccaagagatggaggttgagacactggtggctgaagaggaggac',
    'SSFO-hChR2(C128S/D156A)': 'atggactatggcggcgctttgtctgccgtcggacgcgaacttttgttcgttactaatcctgtggtggtgaacgggtccgtcctggtccctgaggatcaatgttactgtgccggatggattgaatctcgcggcacgaacggcgctcagaccgcgtcaaatgtcctgcagtggcttgcagcaggattcagcattttgctgctgatgttctatgcctaccaaacctggaaatctacatgcggctgggaggagatctatgtgtgcgccattgaaatggttaaggtgattctcgagttcttttttgagtttaagaatccctctatgctctaccttgccacaggacaccgggtgcagtggctgcgctatgcagagtggctgctcacttctcctgtcatccttatccacctgagcaacctcaccggcctgagcaacgactacagcaggagaaccatgggactccttgtctcagccatcgggactatcgtgtggggggctaccagcgccatggcaaccggctatgttaaagtcatcttcttttgtcttggattgtgctatggcgcgaacacattttttcacgccgccaaagcatatatcgagggttatcatactgtgccaaagggtcggtgccgccaggtcgtgaccggcatggcatggctgtttttcgtgagctggggtatgttcccaattctcttcattttggggcccgaaggttttggcgtcctgagcgtctatggctccaccgtaggtcacacgattattgatctgatgagtaaaaattgttgggggttgttgggacactacctgcgcgtcctgatccacgagcacatattgattcacggagatatccgcaaaaccaccaaactgaacatcggcggaacggagatcgaggtcgagactctcgtcgaagacgaagccgaggccggagccgtgcca',
    'hChR2(E123A)': 'atggactatggcggcgctttgtctgccgtcggacgcgaacttttgttcgttactaatcctgtggtggtgaacgggtccgtcctggtccctgaggatcaatgttactgtgccggatggattgaatctcgcggcacgaacggcgctcagaccgcgtcaaatgtcctgcagtggcttgcagcaggattcagcattttgctgctgatgttctatgcctaccaaacctggaaatctacatgcggctgggaggagatctatgtgtgcgccattgaaatggttaaggtgattctcgagttcttttttgagtttaagaatccctctatgctctaccttgccacaggacaccgggtgcagtggctgcgctatgcagagtggctgctcacttctcctgtcatccttatccacctgagcaacctcaccggcctgagcaacgactacagcaggagaaccatgggactccttgtctcagccatcgggactatcgtgtggggggctaccagcgccatggcaaccggctatgttaaagtcatcttcttttgtcttggattgtgctatggcgcgaacacattttttcacgccgccaaagcatatatcgagggttatcatactgtgccaaagggtcggtgccgccaggtcgtgaccggcatggcatggctgtttttcgtgagctggggtatgttcccaattctcttcattttggggcccgaaggttttggcgtcctgagcgtctatggctccaccgtaggtcacacgattattgatctgatgagtaaaaattgttgggggttgttgggacactacctgcgcgtcctgatccacgagcacatattgattcacggagatatccgcaaaaccaccaaactgaacatcggcggaacggagatcgaggtcgagactctcgtcgaagacgaagccgaggccggagccgtgcca',
    'hChR2(E123T/T159C)': 'atggactatggcggcgctttgtctgccgtcggacgcgaacttttgttcgttactaatcctgtggtggtgaacgggtccgtcctggtccctgaggatcaatgttactgtgccggatggattgaatctcgcggcacgaacggcgctcagaccgcgtcaaatgtcctgcagtggcttgcagcaggattcagcattttgctgctgatgttctatgcctaccaaacctggaaatctacatgcggctgggaggagatctatgtgtgcgccattgaaatggttaaggtgattctcgagttcttttttgagtttaagaatccctctatgctctaccttgccacaggacaccgggtgcagtggctgcgctatgcaacctggctgctcacttgtcctgtcatccttatccacctgagcaacctcaccggcctgagcaacgactacagcaggagaaccatgggactccttgtctcagacatcgggtgtatcgtgtggggggctaccagcgccatggcaaccggctatgttaaagtcatcttcttttgtcttggattgtgctatggcgcgaacacattttttcacgccgccaaagcatatatcgagggttatcatactgtgccaaagggtcggtgccgccaggtcgtgaccggcatggcatggctgtttttcgtgagctggggtatgttcccaattctcttcattttggggcccgaaggttttggcgtcctgagcgtctatggctccaccgtaggtcacacgattattgatctgatgagtaaaaattgttgggggttgttgggacactacctgcgcgtcctgatccacgagcacatattgattcacggagatatccgcaaaaccaccaaactgaacatcggcggaacggagatcgaggtcgagactctcgtcgaagacgaagccgaggccggagccgtgcca',
    'hChR2(T159C)': 'atggactatggcggcgctttgtctgccgtcggacgcgaacttttgttcgttactaatcctgtggtggtgaacgggtccgtcctggtccctgaggatcaatgttactgtgccggatggattgaatctcgcggcacgaacggcgctcagaccgcgtcaaatgtcctgcagtggcttgcagcaggattcagcattttgctgctgatgttctatgcctaccaaacctggaaatctacatgcggctgggaggagatctatgtgtgcgccattgaaatggttaaggtgattctcgagttcttttttgagtttaagaatccctctatgctctaccttgccacaggacaccgggtgcagtggctgcgctatgcagagtggctgctcacttgtcctgtcatccttatccacctgagcaacctcaccggcctgagcaacgactacagcaggagaaccatgggactccttgtctcagacatcgggtgtatcgtgtggggggctaccagcgccatggcaaccggctatgttaaagtcatcttcttttgtcttggattgtgctatggcgcgaacacattttttcacgccgccaaagcatatatcgagggttatcatactgtgccaaagggtcggtgccgccaggtcgtgaccggcatggcatggctgtttttcgtgagctggggtatgttcccaattctcttcattttggggcccgaaggttttggcgtcctgagcgtctatggctccaccgtaggtcacacgattattgatctgatgagtaaaaattgttgggggttgttgggacactacctgcgcgtcctgatccacgagcacatattgattcacggagatatccgcaaaaccaccaaactgaacatcggcggaacggagatcgaggtcgagactctcgtcgaagacgaagccgaggccggagccgtgcca',
    'eNpHR-3.0': 'atgacagagaccctgcctcccgtgaccgagagtgccgtggcccttcaagccgaggttacccaaagggagttgttcgagttcgtgctgaacgaccctttgcttgcaagcagtctctatatcaacatcgcacttgcaggactgagtatactgctgttcgtttttatgacccgaggactcgatgatccacgggcaaaacttattgctgtgtcaaccatccttgtgcctgtcgtcagcattgcctcctacactggattggcgagcggcctgacaatttccgttcttgaaatgccagcgggccattttgcagaaggcagctcagtgatgctgggaggagaagaggtagatggtgtagtcaccatgtggggacggtatctcacctgggcactttccacgcccatgattctcctcgctctgggtctcctggccggaagcaatgctacaaagctcttcacagctatcactttcgatatcgctatgtgcgtgactggccttgccgcggccctgactacctcctcccacctcatgagatggttctggtacgctatcagttgtgcatgctttctggtggtcttgtatatcctgctggtggagtgggcacaggacgccaaagccgcgggaaccgctgacatgttcaataccctgaagctgttgacagtagtgatgtggctggggtatccaattgtgtgggctcttggagtcgagggtatcgcggtgttgcccgttggggtgacgagctggggatattctttcctggatatcgtggcaaagtacattttcgcattcttgctcctgaactatctgacgtcaaacgaatctgtcgtgtccggcagcattttggatgttccatctgcttctgggaccccggctgatgat',
    'eArch-3.0': 'atggaccccatcgctctgcaggctggttacgacctgctgggtgacggcagacctgaaactctgtggctgggcatcggcactctgctgatgctgattggaaccttctactttctggtccgcggatggggagtcaccgataaggatgcccgggaatattacgctgtgactatcctggtgcccggaatcgcatccgccgcatatctgtctatgttctttggtatcgggcttactgaggtgaccgtcgggggcgaaatgttggatatctattatgccaggtacgccgactggctgtttaccaccccacttctgctgctggatctggcccttctcgctaaggtggatcgggtgaccatcggcaccctggtgggtgtggacgccctgatgatcgtcactggcctcatcggagccttgagccacacggccatagccagatacagttggtggttgttctctacaatttgcatgatagtggtgctctattttctggctacatccctgcgatctgctgcaaaggagcggggccccgaggtggcatctacctttaacaccctgacagctctggtcttggtgctgtggaccgcttaccctatcctgtggatcataggcactgagggcgctggcgtggtgggcctgggcatcgaaactctgctgtttatggtgttggacgtgactgccaaggtcggctttggctttatcctgttgagatcccgggctattctgggcgacaccgaggcaccagaacccagtgccggtgccgatgtcagtgccgccgac',
    'eMac-3.0': 'atgatcgtggaccagttcgaggaggtgctgatgaagaccagccagctgttcccactgccaaccgctacccagagcgcccagccaacccacgtggcccccgtgccaaccgtgctgcccgacacccccatctacgagaccgtgggcgacagcggcagcaagaccctgtgggtggtgttcgtgctgatgctgatcgccagcgccgccttcaccgccctgagctggaagatccccgtgaacaggaggctgtaccacgtgatcaccaccatcatcaccctgaccgccgccctgagctacttcgctatggctaccggccacggagtggccctgaacaagatcgtgatcaggacccagcacgaccacgtgcccgacacctacgagaccgtgtaccgacaggtgtactacgccaggtacatcgactgggctatcaccaccccactgctgctgctggacctgggactgctggctggaatgagcggagcccacatcttcatggccatcgtggctgacctgatcatggtgctgaccggcctgttcgctgctttcggcagcgagggaaccccacagaagtggggatggtacaccatcgcctgcatcgcctacatcttcgtggtgtggcacctggtgctgaacggcggcgccaacgccagggtgaagggcgagaagctgaggagcttcttcgtggccatcggagcttacaccctgatcctgtggaccgcttacccaatcgtgtggggactggctgacggagctaggaagatcggagtggacggagagatcatcgcttacgctgtgctggacgtgctggctaagggagtgttcggagcttggctgctggtgacccacgccaacctgagggagagcgacgtggagctgaacggcttctgggccaacggcctgaacagggagggcgccatcaggatcggcgaggacgacggcgcc',
    'CHETA-hChR2(E123T/H134R)': 'atggattatggaggcgccctgagtgccgttgggcgcgagctgctatttgtaacgaacccagtagtcgtcaatggctctgtacttgtgcctgaggaccagtgttactgcgcgggctggattgagtcgcgtggcacaaacggtgcccaaacggcgtcgaacgtgctgcaatggcttgctgctggcttctccatcctactgcttatgttttacgcctaccaaacatggaagtcaacctgcggctgggaggagatctatgtgtgcgctatcgagatggtcaaggtgattctcgagttcttcttcgagtttaagaacccgtccatgctgtatctagccacaggccaccgcgtccagtggttgcgttacgccacctggcttctcacctgcccggtcattctcattcgcctgtcaaacctgacgggcttgtccaacgactacagcaggcgcaccatgggtctgcttgtgtctgatattggcacaattgtgtggggcgccacttccgccatggccaccggatacgtcaaggtcatcttcttctgcctgggtctgtgttatggtgctaacacgttctttcacgctgccaaggcctacatcgagggttaccacaccgtgccgaagggccggtgtcgccaggtggtgactggcatggcttggctcttcttcgtatcatggggtatgttccccatcctgttcatcctcggccccgagggcttcggcgtcctgagcgtgtacggctccaccgtcggccacaccatcattgacctgatgtcgaagaactgctggggtctgctcggccactacctgcgcgtgctgatccacgagcatatcctcatccacggcgacattcgcaagaccaccaaattgaacattggtggcactgagattgaggtcgagacgctggtggaggacgaggccgaggctggcgcggtaccc',
    'eNpHR-2.0': 'atgaggggtacgcccctgctcctcgtcgtctctctgttctctctgcttcaggacacagagaccctgcctcccgtgaccgagagtgccgtggcccttcaagccgaggttacccaaagggagttgttcgagttcgtgctgaacgaccctttgcttgcaagcagtctctatatcaacatcgcacttgcaggactgagtatactgctgttcgtttttatgacccgaggactcgatgatccacgggcaaaacttattgctgtgtcaaccatccttgtgcctgtcgtcagcattgcctcctacactggattggcgagcggcctgacaatttccgttcttgaaatgccagcgggccattttgcagaaggcagctcagtgatgctgggaggagaagaggtagatggtgtagtcaccatgtggggacggtatctcacctgggcactttccacgcccatgattctcctcgctctgggtctcctggccggaagcaatgctacaaagctcttcacagctatcactttcgatatcgctatgtgcgtgactggccttgccgcggccctgactacctcctcccacctcatgagatggttctggtacgctatcagttgtgcatgctttctggtggtcttgtatatcctgctggtggagtgggcacaggacgccaaagccgcgggaaccgctgacatgttcaataccctgaagctgttgacagtagtgatgtggctggggtatccaattgtgtgggctcttggagtcgagggtatcgcggtgttgcccgttggggtgacgagctggggatattctttcctggatatcgtggcaaagtacattttcgcattcttgctcctgaactatctgacgtcaaacgaatctgtcgtgtccggcagcattttggatgttccatctgcttctgggaccccggctgatgat',
    
}
# Translate every custom DNA construct to its amino-acid string.
# complete=True translates the whole sequence in a single reading frame
# rather than searching for ORFs.
translated = {
    name: str(NucleotideSequence(dna).translate(complete=True))
    for name, dna in customs.items()
}

# Add the translated constructs to the FASTA entries under their names.
new_fasta.update(translated)

In [89]:
# Keep every entry except those whose header starts with 'QWX', then
# rebuild `new_fasta` from the surviving entries.
filtered = {}
for header, sequence in new_fasta.items():
    if header.startswith('QWX'):
        continue
    filtered[header] = sequence

new_fasta = fasta.FastaFile()
new_fasta.update(filtered)

# Build the feature table for the CSV

In [90]:
# One row per FASTA header. All three columns start out as the full header
# line; 'sseqid' is then reduced to the bare accession.
headers = list(new_fasta.keys())
df = pd.DataFrame({'sseqid': headers, 'Feature': headers, 'Description': headers})

# Keep only the first space-delimited token of each header. Using .str[0]
# instead of expand=True[0] also works when there are no headers at all.
df['sseqid'] = df['sseqid'].str.split(" ", n=1).str[0]

# Reduce "db|ACCESSION|..." style identifiers to the accession field.
df['sseqid'] = df['sseqid'].apply(parse_pipe)

# Raw string: "\|" is an invalid escape sequence in a normal string literal.
# NOTE(review): parse_pipe has already removed every '|' at this point, so
# this substitution looks like a no-op kept as a safety net -- confirm.
problem_name = r"pdb\|(.*)\|"
df['sseqid'] = df['sseqid'].str.replace(problem_name, r"\1", regex=True)

# Manual relabel of one accession with a curated feature name.
idx = df['sseqid'] == 'QTV24999.1'
df.loc[idx, 'Description'] = 'Kir2.1 C terminal export sequence'
df.loc[idx, 'Feature'] = 'Kir2.1 C terminal export sequence'

# Keep the first row for every sseqid.
df = df.drop_duplicates(subset=['sseqid'])

# Save the feature table as CSV

In [91]:
# Base name used to build the CSV, FASTA and DIAMOND database file names.
custom_db_name = "opsins"

In [92]:

# Resolve the CSV path through plannotate's resource helper, write the
# table without the index column, and report where it went.
save_fp = get_resource("data", custom_db_name + ".csv")
df.to_csv(save_fp, index=False)
print(f"Saved to {save_fp}")

Saved to /Users/alex/code/pLannotate/plannotate/data/data/opsins.csv


# Write the FASTA file and build the DIAMOND database

In [93]:


# Resolve the FASTA and database paths through plannotate's resource helper
# and write the combined sequences out as the makedb input.
fasta_fp = get_resource('data', f'{custom_db_name}.fasta')
db_fp = get_resource('BLAST_dbs', f'{custom_db_name}')
new_fasta.write(fasta_fp)

# Pass the arguments as a list (no shell) so that paths containing spaces or
# shell metacharacters reach diamond verbatim. check=True raises
# CalledProcessError if makedb fails, instead of leaving a missing or stale
# database behind unnoticed.
command = ['diamond', 'makedb', '--in', str(fasta_fp), '-d', str(db_fp)]
# Last expression of the cell: still displays the exit status (0 on success).
subprocess.run(command, check=True).returncode

diamond v2.1.7.161 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 10
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /Users/alex/code/pLannotate/plannotate/data/data/opsins.fasta
Opening the database file...  [0s]
Loading sequences...  [0s]
Masking sequences...  [0.002s]
Writing sequences...  [0s]
Hashing sequences...  [0s]
Loading sequences...  [0s]
Writing trailer...  [0s]
Closing the input file...  [0s]
Closing the database file...  [0s]

Database sequences  157
  Database letters  48862
     Database hash  0cadf2d67891eebdac18bb005b25d0ef
        Total time  0.003000s


0